From dcea58aea7f6a8d6b44f4d00d71e08a670e27a95 Mon Sep 17 00:00:00 2001
From: rschoene <rene.schoene@tu-dresden.de>
Date: Fri, 15 Mar 2019 20:51:08 +0100
Subject: [PATCH] Alternate scripts reporting.

---
 .../trainbenchmark-reporting/.gitignore       |   2 +-
 .../trainbenchmark-reporting/build.gradle     |  25 +-
 .../trainbenchmark-reporting/do-merge.sh      |   1 +
 .../merge_results.json                        |  27 ++
 .../trainbenchmark-reporting/merge_results.py | 242 ++++++++++++++++++
 .../trainbenchmark-reporting/toolwise.R       | 154 +++++++++++
 .../trainbenchmark-reporting/toolwise.py      |  33 +++
 .../trainbenchmark-reporting/toolwise.sh      |   4 +
 8 files changed, 484 insertions(+), 4 deletions(-)
 create mode 100755 trainbenchmark/trainbenchmark-reporting/do-merge.sh
 create mode 100644 trainbenchmark/trainbenchmark-reporting/merge_results.json
 create mode 100755 trainbenchmark/trainbenchmark-reporting/merge_results.py
 create mode 100644 trainbenchmark/trainbenchmark-reporting/toolwise.R
 create mode 100644 trainbenchmark/trainbenchmark-reporting/toolwise.py
 create mode 100755 trainbenchmark/trainbenchmark-reporting/toolwise.sh

diff --git a/trainbenchmark/trainbenchmark-reporting/.gitignore b/trainbenchmark/trainbenchmark-reporting/.gitignore
index fb37a9f26..5bfbad918 100644
--- a/trainbenchmark/trainbenchmark-reporting/.gitignore
+++ b/trainbenchmark/trainbenchmark-reporting/.gitignore
@@ -1,2 +1,2 @@
 Rplots.pdf
-
+local-merge_results.json
diff --git a/trainbenchmark/trainbenchmark-reporting/build.gradle b/trainbenchmark/trainbenchmark-reporting/build.gradle
index 5e8f0e2e1..7fc025885 100644
--- a/trainbenchmark/trainbenchmark-reporting/build.gradle
+++ b/trainbenchmark/trainbenchmark-reporting/build.gradle
@@ -1,7 +1,26 @@
+task doMerge(type: Exec) {
+    group = 'Benchmark'
+    description = 'Merges the results'
+    commandLine './do-merge.sh'  // wrapper script that invokes merge_results.py
+}
+
 task plot(type: Exec) {
-  commandLine 'Rscript', 'report.R'
+    group = 'Benchmark'
+    description = 'Plots the \'classic\' TrainBenchmark result'
+    commandLine 'Rscript', 'report.R'
+    dependsOn doMerge  // run the merge step before plotting
 }
 
 task plotIndividual(type: Exec) {
-  commandLine 'Rscript', 'individual.R'
-}
\ No newline at end of file
+    group = 'Benchmark'
+    description = 'Plots the individual TrainBenchmark results'
+    commandLine 'Rscript', 'individual.R'
+    dependsOn doMerge  // run the merge step before plotting
+}
+
+task plotToolwise(type: Exec) {
+    group = 'Benchmark'
+    description = 'Plots the individual TrainBenchmark results per tool'
+    commandLine './toolwise.sh'  // runs toolwise.py, which calls toolwise.R once per configured tool
+    dependsOn doMerge  // run the merge step before plotting
+}
diff --git a/trainbenchmark/trainbenchmark-reporting/do-merge.sh b/trainbenchmark/trainbenchmark-reporting/do-merge.sh
new file mode 100755
index 000000000..cda5cf12b
--- /dev/null
+++ b/trainbenchmark/trainbenchmark-reporting/do-merge.sh
@@ -0,0 +1 @@
+python merge_results.py --result-dir ../results/ --create-run-dirs --create-toolwise-dirs "$@"  # quoted: keeps arguments with spaces intact
diff --git a/trainbenchmark/trainbenchmark-reporting/merge_results.json b/trainbenchmark/trainbenchmark-reporting/merge_results.json
new file mode 100644
index 000000000..d8de85321
--- /dev/null
+++ b/trainbenchmark/trainbenchmark-reporting/merge_results.json
@@ -0,0 +1,27 @@
+{
+  "tools": [
+    "tinkergraph",
+    "drools",
+    "mysql",
+    "kiama",
+    "jastadd-java-references",
+    "jastadd-java-references-incremental",
+    "jastadd-symbolic-references-incremental",
+    "jastadd-symbolic-references",
+    "sqlite",
+    "viatra",
+    "racr-cpp",
+    "racr-python",
+    "racr-scheme",
+    "neo4j",
+    "sesame",
+    "emfapi",
+    "rdf4j",
+    "epsilon",
+    "eclipseocl"
+  ],
+  "ignored": [
+  ],
+  "toolwise": [
+  ]
+}
diff --git a/trainbenchmark/trainbenchmark-reporting/merge_results.py b/trainbenchmark/trainbenchmark-reporting/merge_results.py
new file mode 100755
index 000000000..a1fdb731e
--- /dev/null
+++ b/trainbenchmark/trainbenchmark-reporting/merge_results.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python
+import argparse
+import csv
+import glob
+import json
+import logging
+import os
+import os.path
+import re
+import sys
+
+
+FORMAT = '%(asctime)s %(levelname)-8s %(threadName)-10s (%(filename)s:%(lineno)d): %(message)s'
+BENCHMARK_PATTERN = re.compile('.*-(BatchModel|Repair|Inject)Test.*')  # files linked into merged/benchmark
+logger = logging.getLogger('merge_result')
+SIZE_PATTERN = re.compile('.*-railway-[^\\-]*-([^\\-]*)-.csv')  # group 1: size, compared to --max-size
+NAME_PATTERN = re.compile('(times|matches)-([^\\-]*)-.*.csv')  # group 2: tool name part of a result file
+RUN_PATTERN = re.compile('run-(....)-(..)-(..)-(..)-(..)-(..)')  # six dash-separated fields of a run name
+RUN_REPLACMENT = r'\1_\2_\3 \4:\5:\6'  # (sic) rewrites those fields as 'AAAA_BB_CC DD:EE:FF'
+
+
+def include_file_config(args):
+    """Apply the JSON config file (plus its 'local-' variant) to args; exits without "tools"."""
+    def override_if_defined(key, convert=lambda x: x, dest=None):
+        keyArgs = key.replace('-', '_')
+        value = content.get(key) or content.get(keyArgs)
+        if value:  # falsy config values (false, 0, "") never override
+            setattr(args, dest or keyArgs, convert(value))  # replaces what argparse stored
+
+    with open(args.file_config) as fdr:
+        content = json.load(fdr)
+    # update with local version, if existing
+    directory, basename = os.path.split(os.path.abspath(args.file_config))
+    local_config_file = os.path.join(directory, 'local-' + basename)
+    if os.path.exists(local_config_file):
+        with open(local_config_file) as fdr:
+            content.update(json.load(fdr))
+    else:
+        logger.debug('No local config file found.')
+    if not content.get('tools'):
+        logger.error('Key "tools" not found in config file "' + args.file_config + '". Exiting.')
+        sys.exit(1)
+    args.tools = content['tools']
+    override_if_defined('max-size', int)
+    override_if_defined('dry-run', bool)
+    override_if_defined('result-dir')
+    override_if_defined('create-run-dirs', bool)
+    override_if_defined('create-toolwise-dirs', bool)
+    override_if_defined('no-clean', lambda value: not value, dest='clean')  # parser dest is 'clean'
+    override_if_defined('verbose', bool)
+
+
+def create_link(fileToLink, linkName, dry_run):
+    """Symlink linkName to fileToLink, replacing an existing entry; only log it if dry_run."""
+    (logger.info if dry_run else logger.debug)('Linking %s to %s', fileToLink, linkName)
+    if not dry_run:
+        if os.path.lexists(linkName):
+            os.unlink(linkName)
+        os.symlink(fileToLink, linkName)
+
+
+def ensure_directory(dir_name, dry_run):
+    """Create dir_name unless it already exists; in dry-run mode only report the creation."""
+    if not os.path.exists(dir_name):
+        logger.info('Creating %s', dir_name)
+        if not dry_run:
+            os.mkdir(dir_name)
+
+
+def exceeds(filename, max_size):
+    found = SIZE_PATTERN.match(filename)  # file names without a size part never exceed
+    return bool(found) and int(found.group(1)) > max_size
+
+
+def remove_if_there(the_list, element_to_remove):
+    if element_to_remove in the_list:  # a missing element is not an error
+        the_list.remove(element_to_remove)  # in place; drops the first occurrence only
+
+
+def new_run_name(old_run_name):
+    return RUN_PATTERN.sub(RUN_REPLACMENT, old_run_name)  # names without a match come back unchanged
+
+
+def copy_replace(fileTocopy, all_runs_dir, tool_name, run, dry_run):
+    """
+    Copy fileTocopy into all_runs_dir, replacing tool_name with the rewritten
+    run name in the file name and in every line except the first (header) line.
+    In dry-run mode the copy is only logged, nothing is written.
+    """
+    run_name = new_run_name(run)
+    targetFile = os.path.join(
+        all_runs_dir, os.path.basename(fileTocopy).replace(tool_name, run_name))
+    (logger.info if dry_run else logger.debug)('Copying %s to %s', fileTocopy, targetFile)
+    if dry_run:
+        return
+    with open(fileTocopy) as fdr_source, open(targetFile, 'w') as fdr_target:
+        fdr_target.write(next(fdr_source, ''))  # header line is copied verbatim
+        for line in fdr_source:
+            fdr_target.write(line.replace(tool_name, run_name))
+
+
+def main(args):
+    """
+    Main process.
+
+    Used directory structure/variables:
+    results/                        -> result_dir
+        tools/                      -
+            tool1/                  -> tool_dir
+                tool1-run1/         -> run_dir
+                    times.csv       -
+                    matches.csv     -
+                run-list.csv        -
+                all-runs/           -> all_runs_dir
+                    times-run1.csv@ -
+        run1/                       -> global_run_dir
+        merged/                     -> merged_dir
+            individual/             -> merged_dir_individual
+                times.csv@          -
+            benchmark/              -> merged_dir_benchmark
+                times.csv@          -
+
+    Runs of a tool are visited in reverse name order and only the first file
+    seen under a given name is linked into the merged directories.
+    """
+    log_action = logger.info if args.dry_run else logger.debug
+
+    # Gathering paths, creating top-level directories
+    result_dir = os.path.abspath(args.result_dir)
+    merged_dir = os.path.join(result_dir, 'merged')
+    merged_dir_benchmark = os.path.join(merged_dir, 'benchmark')
+    merged_dir_individual = os.path.join(merged_dir, 'individual')
+    for dir_name in (merged_dir, merged_dir_benchmark, merged_dir_individual):
+        ensure_directory(dir_name, args.dry_run)
+
+    # Gathering tools: args.tools is the plain list from the config file, so
+    # there is no header row to skip (skipping one dropped the first tool)
+    tools = [tool for tool in args.tools if tool]
+    logger.debug('result_dir: %s, tools: %s', result_dir, tools)
+
+    # Clean symlinks if requested or max_size is set
+    if (args.clean or args.max_size) and not args.dry_run:
+        for dir_to_clean in [merged_dir, merged_dir_benchmark, merged_dir_individual]:
+            for link in os.listdir(dir_to_clean):
+                linkName = os.path.join(dir_to_clean, link)
+                if os.path.islink(linkName):
+                    os.unlink(linkName)
+    if (args.clean or args.create_toolwise_dirs) and not args.dry_run:
+        for linkName in glob.iglob(os.path.join(result_dir, 'tools', '*', 'all-runs', '*.csv')):
+            os.remove(linkName)
+
+    # Merge results
+    for tool in tools:
+        if tool.startswith('#'):
+            logger.debug('Ignoring tool "%s"', tool[1:])
+            continue
+        already_merged = set()
+        tool_dir = os.path.join(result_dir, 'tools', tool)
+        if not os.path.exists(tool_dir):
+            logger.warning('Tool not found: %s', tool)
+            continue
+        all_runs_dir = os.path.join(tool_dir, 'all-runs')
+        ensure_directory(all_runs_dir, args.dry_run)
+        runs = sorted(os.listdir(tool_dir), reverse=True)
+        remove_if_there(runs, 'all-runs')
+        remove_if_there(runs, 'run-list.csv')
+
+        if args.create_toolwise_dirs and not args.dry_run:
+            # write out run-list.csv (skipped in dry-run mode, which must not write)
+            with open(os.path.join(tool_dir, 'run-list.csv'), 'w') as fdr:
+                fdr.write('Runs\n')
+                for run in runs:
+                    fdr.write(new_run_name(run) + '\n')
+
+        for run in runs:
+            run_dir = os.path.join(tool_dir, run)
+            global_run_dir = os.path.join(result_dir, run)
+            if not os.path.isdir(run_dir):
+                continue
+            # NOTE(review): args.create_run_dirs is never consulted; the per-run
+            # directories are always (re)created -- confirm whether that is intended
+            ensure_directory(global_run_dir, args.dry_run)
+            for csvFile in os.listdir(run_dir):
+                # link file in run directory
+                fileToLink = os.path.join(tool_dir, run, csvFile)
+                linkName = os.path.join(global_run_dir, csvFile)
+                create_link(fileToLink, linkName, args.dry_run)
+                # skip if max-size is set and size is exceeded
+                if args.max_size and exceeds(fileToLink, args.max_size):
+                    continue
+                if args.create_toolwise_dirs:
+                    # link in all-runs (rename file accordingly)
+                    match = NAME_PATTERN.match(csvFile)
+                    if match:
+                        if not BENCHMARK_PATTERN.match(csvFile):
+                            tool_name = match.group(2)
+                            copy_replace(fileToLink, all_runs_dir, tool_name, run, args.dry_run)
+                    else:
+                        logger.warning('file did not match pattern: %s', csvFile)
+                # link file in merged directory
+                if csvFile not in already_merged:
+                    linkName = os.path.join(merged_dir_benchmark if BENCHMARK_PATTERN.match(csvFile)
+                                            else merged_dir_individual, csvFile)
+                    create_link(fileToLink, linkName, args.dry_run)
+                    already_merged.add(csvFile)
+                else:
+                    log_action('Skipping %s', csvFile)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Merge results of all benchmark runs.',
+        epilog="""The config file must contain the key "tools" specifying the tools to process.
+        It further can contain any long version of arguments to this program as a default value.
+        Any command line parameter will override such a default value.
+        Additionally, a local version of the file will be read, overriding the default values.
+        Its filename is "local-" prepended to the name of the config file.""")
+    parser.add_argument(
+        "-r", "--result-dir", help="Path to result directory to search in.", type=str)
+    parser.add_argument(
+        "-c", "--create-run-dirs", help="Whether to recreate runs directories.",
+        action="store_true")
+    parser.add_argument(
+        "-t", "--create-toolwise-dirs", help="Whether to recreate toolwise regression directories.",
+        action="store_true")
+    parser.add_argument(
+        "-d", "--dry-run", help="Only print action, don't execute them.", action="store_true")
+    parser.add_argument(
+        "-n", "--no-clean", help="Don't remove previously existing symlinks in merged dir.",
+        dest='clean', action="store_false")
+    parser.add_argument("-v", "--verbose", help="Print debug messages.", action="store_true")
+    parser.add_argument(
+        "-m", "--max-size", type=int,
+        help="Maximum benchmark size to include. Implies cleaning existing symlinks.")
+    parser.add_argument(
+        "-f", "--file-config", default='merge_results.json', help="Config file to use.")
+    args = parser.parse_args()
+    # configure logging before reading the config so its messages are not lost
+    logging.basicConfig(format=FORMAT, level=logging.DEBUG if args.verbose else logging.INFO)
+    include_file_config(args)  # may switch on verbose, hence the level update below
+    logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)
+    main(args)
diff --git a/trainbenchmark/trainbenchmark-reporting/toolwise.R b/trainbenchmark/trainbenchmark-reporting/toolwise.R
new file mode 100644
index 000000000..3d0db01f3
--- /dev/null
+++ b/trainbenchmark/trainbenchmark-reporting/toolwise.R
@@ -0,0 +1,154 @@
+library(data.table)
+library(reshape2)
+library(plyr)
+library(ggplot2)
+library(ggrepel)
+library(arules)
+library(ggforce)
+
+source('util.R')
+
+args = commandArgs(trailingOnly=TRUE)
+if (length(args)==0) {
+  stop("At least one argument must be supplied (tool-name).\n", call.=FALSE)
+}
+toolName = args[1]
+
+# prepare output directory
+output_dir = paste("../diagrams/merged", toolName, sep="/")
+if (!(dir.exists(output_dir))) {
+    dir.create(output_dir, recursive = TRUE)  # also creates missing parent directories
+}
+
+# constants
+workloads = c(
+  "PosLength",  "SwitchMonitored",
+  "RouteSensor", "SwitchSet",
+  "ConnectedSegments", "SemaphoreNeighbor"
+)
+phases = c("Read", "Check", "Read.and.Check", "Transformation", "Recheck", "Transformation.and.Recheck")
+phasesPrettified = c("Read", "Check", "Read and Check", "Transformation", "Recheck", "Transformation and Recheck")
+
+sizes = list()      # 1     2      4      8      16      32      64      128     256     512     1024    2048   4096
+sizes[["Repair"]] = c("8k", "15k", "33k", "66k", "135k", "271k", "566k", "1.1M", "2.2M", "4.6M", "9.3M", "18M", "37M")
+
+runList = read.csv(paste("../results/tools/", toolName, "/run-list.csv", sep=""), colClasses=c(rep("character",1)))
+
+# load the data: every times-*.csv below the tool's all-runs directory (despite its name, tsvs holds CSV paths)
+tsvs = list.files(paste("../results/tools/", toolName, "/all-runs/", sep=""), pattern = "times-.*\\.csv", full.names = T, recursive = T)
+
+l = lapply(tsvs, read.csv)
+times = rbindlist(l)
+
+# preprocess the data
+times$Tool = factor(times$Tool, levels = runList$Runs)
+keep_descriptions_first_char(times)
+
+times$Model = gsub("\\D+", "", times$Model)
+times$Model = as.numeric(times$Model)
+times$Time = times$Time / 10^6
+# make the phases a factor with a fixed set of values to help dcasting
+# (e.g. Batch measurements do not have Transformation and Recheck attributes,
+# hence accessing the "Transformation" attribute would throw an error)
+times$Phase = factor(times$Phase, levels = c("Read", "Check", "Transformation", "Recheck"))
+
+times.wide = dcast(data = times,
+                   formula = Tool + Workload + Description + Model + Run ~ Phase,
+                   value.var = "Time",
+                   drop = T,
+                   fun.aggregate = mean
+)
+
+# calculate aggregated values
+times.derived = times.wide
+times.derived$Read.and.Check = times.derived$Read + times.derived$Check
+times.derived$Transformation.and.Recheck = times.derived$Transformation + times.derived$Recheck
+
+# calculate the median value of runs
+times.aggregated.runs = ddply(
+  .data = times.derived,
+  .variables = c("Tool", "Workload", "Description", "Model"),
+  .fun = colwise(median),
+  .progress = "text"
+)
+# drop the "Run" column
+times.aggregated.runs = subset(times.aggregated.runs, select = -c(Run))
+
+times.processed = melt(
+  data = times.aggregated.runs,
+  id.vars = c("Tool", "Workload", "Description", "Model"),
+  measure.vars = phases,
+  variable.name = "Phase",
+  value.name = "Time"
+)
+
+# beautify plotted record:
+# 1. change dots to spaces
+# 2. make sure that the phases are still factors
+times.plot = times.processed
+times.plot$Phase = gsub('\\.', ' ', times.plot$Phase)
+times.plot$Phase = factor(times.plot$Phase, levels = phasesPrettified)
+times.plot$Workload = factor(times.plot$Workload, levels = workloads)
+
+### line charts
+for (phase in phasesPrettified) {
+  phase.filename = gsub(' ', '-', phase)
+  workloadSizes = sizes[["Repair"]]
+
+  # filter the dataframe to the current phase
+  df = times.plot[times.plot$Phase == phase, ]
+
+  # do not visualize empty data sets
+  if (nrow(df) == 0) {
+    print(paste("No rows to visualize for phase", phase))
+    next
+  }
+
+  # x axis labels
+  xbreaks = unique(df$Model)
+  currentWorkloadSizes = head(workloadSizes, n=length(xbreaks))
+  xlabels = paste(xbreaks, "\n", currentWorkloadSizes, sep = "")
+
+  # blank out every other model size label
+  maxLabel = max(log2(max(df$Model)), 2)
+  if (maxLabel %% 2) {
+    start = 3
+  } else {
+    start = 2
+  }
+  filter = seq(start, maxLabel, by=2)
+
+  xlabels[filter] = ""
+
+  # y axis labels
+  yaxis = nice_y_axis()
+  ybreaks = yaxis$ybreaks
+  ylabels = yaxis$ylabels
+
+  p = ggplot(df) + #na.omit(df)) +
+    aes(x = as.factor(Model), y = Time) +
+    labs(title = paste("Individual query execution time,", phase, "phase, ", toolName), x = "Model size\n#Elements", y = "Execution times [ms]") +
+    geom_point(aes(col = Tool, shape = Tool), size = 2.0) +
+    scale_shape_manual(values = seq(0, 15)) +
+    geom_line(aes(col = Tool, group = Tool), size = 0.5) +
+    scale_x_discrete(breaks = xbreaks, labels = xlabels) +
+    scale_y_log10(breaks = ybreaks, labels = ylabels) +
+    guides(color = guide_legend(ncol = 4)) +
+    theme_bw() +
+    theme(
+      plot.title = element_text(hjust = 0.5),
+      text = element_text(size = 10),
+      legend.key = element_blank(),
+      legend.title = element_blank(),
+      legend.position = "bottom",
+      axis.text = element_text(size = 9)
+    )
+  print(p)
+  for (cpage in seq_along(workloads)) {  # one page (and one PDF) per workload
+    ggsave(
+      plot = p + facet_grid_paginate(~ Workload, nrow=1, ncol = 1, page=cpage, scale = "free"),
+      filename = paste(output_dir, "/", toolName, "-", phase.filename, "-",workloads[cpage], ".pdf", sep=""),
+      width = 250, height = 150, units = "mm"
+    )
+  }
+}
diff --git a/trainbenchmark/trainbenchmark-reporting/toolwise.py b/trainbenchmark/trainbenchmark-reporting/toolwise.py
new file mode 100644
index 000000000..93a275357
--- /dev/null
+++ b/trainbenchmark/trainbenchmark-reporting/toolwise.py
@@ -0,0 +1,33 @@
+import argparse
+import json
+import logging
+import os.path
+import subprocess
+
+
+FORMAT = '%(asctime)s %(levelname)-8s %(threadName)-10s (%(filename)s:%(lineno)d): %(message)s'
+logger = logging.getLogger('toolwise')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Plot results per tool.')
+    parser.add_argument("-v", "--verbose", help="Print debug messages.", action="store_true")
+    parser.add_argument(
+        "-f", "--file-config", default='merge_results.json', help="Config file to use.")
+    args = parser.parse_args()
+    logging.basicConfig(format=FORMAT, level=logging.DEBUG if args.verbose else logging.INFO)
+    # load config file (honouring --file-config instead of a hard-coded name)
+    with open(args.file_config) as fdr:
+        content = json.load(fdr)
+    # update with local version, if existing
+    directory, basename = os.path.split(os.path.abspath(args.file_config))
+    local_config_file = os.path.join(directory, 'local-' + basename)
+    if os.path.exists(local_config_file):
+        with open(local_config_file) as fdr:
+            content.update(json.load(fdr))
+    else:
+        logger.debug('No local config file found.')
+    for tool in content.get('toolwise', []):
+        logger.info('Processing %s now.', tool)
+        if subprocess.call(["Rscript", "toolwise.R", tool]) != 0:  # keep going with the other tools
+            logger.warning('Rscript toolwise.R failed for %s.', tool)
diff --git a/trainbenchmark/trainbenchmark-reporting/toolwise.sh b/trainbenchmark/trainbenchmark-reporting/toolwise.sh
new file mode 100755
index 000000000..b9891085e
--- /dev/null
+++ b/trainbenchmark/trainbenchmark-reporting/toolwise.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+python toolwise.py "$@"  # forwards the options listed below
+# --file-config <file>
+# --verbose
-- 
GitLab