Browse Source

Added Code and data

master
Bianca Steffes 1 year ago
commit
13d30ecd6f
  1. 7
      .gitignore
  2. 214
      data/download_rii.py
  3. 47
      pm_summary/annotation_evaluation.py
  4. 148
      rouge.py
  5. 1439
      rouge_evalauation/create_evaluation_files.py
  6. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx
  7. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx
  8. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx
  9. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx
  10. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx
  11. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx
  12. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx
  13. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx
  14. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx
  15. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx
  16. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx
  17. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx
  18. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx
  19. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx
  20. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx
  21. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx
  22. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx
  23. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx
  24. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx
  25. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx
  26. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx
  27. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx
  28. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx
  29. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx
  30. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx
  31. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx
  32. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx
  33. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx
  34. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx
  35. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx
  36. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx
  37. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx
  38. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx
  39. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx
  40. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx
  41. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx
  42. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx
  43. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx
  44. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx
  45. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx
  46. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx
  47. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx
  48. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx
  49. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 84-20.xlsx
  50. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx
  51. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx
  52. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx
  53. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx
  54. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx
  55. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx
  56. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx
  57. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx
  58. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx
  59. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx
  60. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx
  61. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx
  62. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx
  63. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx
  64. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx
  65. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx
  66. BIN
      rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx
  67. BIN
      rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx
  68. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx
  69. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx
  70. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx
  71. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx
  72. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx
  73. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx
  74. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx
  75. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx
  76. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx
  77. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx
  78. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx
  79. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx
  80. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx
  81. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx
  82. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx
  83. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx
  84. BIN
      rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx
  85. BIN
      rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx
  86. 12
      settings.py
  87. 192
      test/test_rouge.py
  88. 475
      utils.py

7
.gitignore

@ -0,0 +1,7 @@
.idea
__pycache__
data/dataframes
rouge_evalauation/dataframes
rouge_evalauation/figures
rouge_evalauation/evaluated_data/second_eval/AufeinanderfolgendeSätze.xlsx
rouge_evalauation/manual_evaluation

214
data/download_rii.py

@ -0,0 +1,214 @@
import time
import xml.etree.ElementTree as ET
import urllib.request as request
import zipfile
import os
import pandas as pd
import settings
import utils
from utils import time_convert
# Directory layout for the raw court decisions downloaded from rechtsprechung-im-internet.de
base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'  # one subdirectory per senate
dataframe_dir_bgh = 'dataframes/bgh/'         # destination for the combined pickle
pickle_name_bgh = 'bgh_data.pkl'
# XML tags read directly from each case file (see read_file_data)
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
# Attributes nested one level deep; the underscore encodes the tag path (e.g. region/abk)
nested_attributes = ["region_abk", "region_long"]
# Subset of attributes holding free text (candidates for text preprocessing)
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'  # filename suffix for stopword-free variants
current_path = 'data'                 # module location, passed to utils path helpers
def get_file_list():
    """
    Downloads the table-of-contents XML listing all currently available cases.

    :return: the web page with all current cases as an xml-tree (root element)
    """
    toc_url = 'https://www.rechtsprechung-im-internet.de/rii-toc.xml'
    local_file, _ = request.urlretrieve(toc_url)
    return ET.parse(local_file).getroot()
def count_cases(root, tag):
    """
    Counts all cases belonging to the given tag.

    :param root: downloaded xml-tree with all files
    :param tag: tag to find in the name (first child of each entry)
    :return: number of cases whose name contains the tag
    """
    return sum(1 for child in root if tag in child[0].text)
def download(base_dir, extended_dir, tag):
    """
    Downloads all cases of the given court into one folder per senate.

    :param base_dir: Name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download
    root = get_file_list()  # child[0] ist gericht, child[3] ist link
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        if tag not in child[0].text:
            continue
        # BUGFIX: the original wrapped this in "while True: try: ... finally: break".
        # A `break` in a finally clause silently discards any in-flight exception,
        # so every failed download vanished without a trace (the construct is also
        # deprecated in newer Python versions). Failures are now reported while
        # still continuing with the next case.
        try:
            filename, http = request.urlretrieve(child[3].text)
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(
                    utils.server_path(current_path=current_path,
                                      path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
            os.remove(filename)
            downloaded += 1
            print("\rDownloaded %d of %d " % (downloaded, max_cases) + tag + "Cases", end="")
        except Exception as e:
            print("\nSkipping %s: %s" % (child[0].text, e))
    print("\nDone!")
def _element_text(element):
    """
    Joins all text fragments of an xml element, removing the space that would
    otherwise precede a sentence mark.

    :param element: xml element (or None if the tag was absent)
    :return: the cleaned text, or None if the element is missing or holds no text
    """
    if element is None:
        # robustness: the original crashed with AttributeError on a missing tag
        return None
    text = ''
    for t in element.itertext():
        if t == '.' or t == ',' or t == ';' or t == '!' or t == '?':
            text = text.strip()  # remove space before these characters
        text += t + ' '
    text = text.strip()
    return None if text == '' else text


def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: package containing (filename, directory, directory extension) to address the file
    :return: a one-row pandas DataFrame with key: attribute_name and value: attribute_value
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        # NOTE (from original author, translated): check leitsatz -- two words may be
        # joined while the whitespace is still present!
        res[attribute] = _element_text(root.find(attribute))
    for attribute in nested_attributes:
        # the underscore in the attribute name encodes the nesting path, e.g. region_abk -> region/abk
        xml_tag = root
        for part in attribute.split('_'):
            xml_tag = xml_tag.find(part)
        res[attribute] = _element_text(xml_tag)
    # post-process the free-text columns according to the global settings
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])
def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended dir to find the files
    :param pickle_name: name of the pickle to save
    :param steps: how many cases should be worked on now
    """
    # never delete: the pickle may already contain results from earlier runs
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    # every .xml file in every senate subdirectory, tagged with its location
    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]
    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)
    pickle_path = dataframe_dir_bgh+extension+pickle_name
    # resume support: drops already-processed files and loads previous results
    # (presumably -- confirm against utils.get_step_subset_raw)
    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)
    # parse the case files in parallel; each worker returns a one-row DataFrame
    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)
    print('Resulting dataframes have length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)
def get_selected_bgh_data(directory='.\\'):
"""
Shortcut for getting the BGH data currently needed. Selects all data from the Civil copurts which contain 'Urteile'
:param directory: directory offset from current position, with ending slashes
:return: the data
"""
return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')
def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for access to the bgh pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with ending slash)
    :param spruchkoerper: Parameter can be used to select the senates (checks whether the given string is contained
    in the datas spruchkoerper)
    :param doktyp: can be used to select specific documents (like 'Urteil', 'Beschluss', etc.), must contain the word
    :return: The data as a pandas dataframe
    """
    prefix = settings.no_brackets_suffix if settings.remove_brackets else ''
    data = utils.df_from_pickle(current_path=current_path,
                                path=directory + dataframe_dir_bgh + prefix + pickle_name)
    if spruchkoerper is not None:
        # rows without a senate cannot match, drop them before the substring test
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    # drop all columns with no value, then duplicate rows
    data = data.dropna(axis=1, how='all')
    return data.drop_duplicates()
# if __name__ == "__main__":
# download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
# create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)

47
pm_summary/annotation_evaluation.py

@ -0,0 +1,47 @@
import os
import pandas as pd
import utils
# Column keys used for the annotated press-release spreadsheets.
# The German strings are the literal column headers / questions in the
# annotation files and must not be changed.
pm_sent_no = 'pm_sent_number'
pm_sent = 'Sätze der Pressemitteilung'            # sentences of the press release
judgement_sent_no = 'judgement_sent_number'
judgement_sent = 'Dazu passende Sätze des Urteils'  # matching sentences of the judgment
keywords = 'Schlagworte'                          # keywords
comments = 'Anmerkung'                            # remarks
duration = 'Wie lange hast Du für die Bearbeitung dieses Urteils gebraucht?'  # time needed
bad_pm = 'Ist diese Pressemitteilung eine schlechte Darstellung / Zusammenfassung des Urteils?'  # bad summary?
current_dir = 'pm_summary/'  # module location, passed to utils path helpers
def prepare_file(path):
    """
    Reads one annotated file and converts it into a uniform format.

    :param path: path to the file
    :return: dictionary holding the results; every sentence of the press release
        gets its own sub-dict, keyed by its sentence number
    """
    raw_data = pd.read_excel(path, names=[pm_sent_no, pm_sent], header=None)
    sentences = {}
    for _, row in raw_data.iterrows():
        entry = {pm_sent_no: row[pm_sent_no], pm_sent: row[pm_sent]}
        sentences[entry[pm_sent_no]] = entry
    return sentences
def get_all_pm_files():
    """
    Returns the list of all annotated pm-files.

    :return: [(pm_filename, file_data)*]
    """
    file_path_base = utils.server_path(current_path=current_dir,
                                       path='../rouge_evalauation/evaluated_data/extractive_judgments')
    return [(judgment, prepare_file(file_path_base + '/' + judgment))
            for judgment in os.listdir(file_path_base)
            if '.xlsx' in judgment]

148
rouge.py

@ -0,0 +1,148 @@
import utils
# Weight of recall relative to precision in the ROUGE-L f-measure;
# beta = 1 yields the balanced F1 score.
beta = 1
# Reference implementation follows Lin (2004): https://aclanthology.org/W04-1013/
def rouge_n(reference_summary, created_summary, n, pp_options=None, extended_results=False):
    """
    Calculates the rouge n score.

    :param reference_summary: gold standard summary
    :param created_summary: summary to evaluate
    :param n: size of n-grams
    :param pp_options: list of options for preprocessing, if None then no preprocessing will be done
    :param extended_results: indicates whether precision, recall and f-measure should be returned
    :return: the recall score, or (precision, recall, f-measure) if extended results are wanted
    """
    # preprocess
    if pp_options is not None:  # otherwise don't preprocess. Text is already preprocessed
        reference_summary = utils.preprocess_text(reference_summary, pp_options)
        created_summary = utils.preprocess_text(created_summary, pp_options)
    else:  # separate sentence marks from tokens
        for sentence_mark in utils.sentence_marks:
            reference_summary = reference_summary.replace(sentence_mark, ' '+sentence_mark)
            created_summary = created_summary.replace(sentence_mark, ' ' + sentence_mark)
    # split into n-grams of size n and count occurrences of single ngrams
    reference_ngrams, ref_complete_count = count_n_grams(reference_summary, n)
    created_ngrams, created_complete_count = count_n_grams(created_summary, n)
    overlapping_count = 0
    for ref_key, ref_count in reference_ngrams.items():
        created_count = created_ngrams.get(ref_key)
        if created_count is not None:  # ngram appears in both summaries
            overlapping_count += min(ref_count, created_count)
    # calculate score.
    # BUGFIX: the original returned a bare 0 here even when a triple was requested,
    # giving callers an inconsistent return type.
    if ref_complete_count == 0:
        return (0, 0, 0) if extended_results else 0
    recall = overlapping_count / ref_complete_count
    if extended_results:
        # BUGFIX: guard both divisions -- an empty created summary or a zero
        # overlap previously raised ZeroDivisionError.
        precision = overlapping_count / created_complete_count if created_complete_count else 0
        f_measure = (2 * precision * recall) / (precision + recall) if precision + recall else 0
        return precision, recall, f_measure
    return recall
def count_n_grams(pp_summary, n):
    """
    Counts the n-grams of the given size in a summary.

    :param pp_summary: Pre-processed summary
    :param n: n for the size of ngrams
    :return: ({ngram: count}, total number of ngrams) for all ngrams in the summary
    """
    tokens = pp_summary.split(' ')
    counts = {}
    total = 0
    # every window of n consecutive tokens forms one ngram
    for start in range(len(tokens) - n + 1):
        gram = ' '.join(tokens[start:start + n])
        if gram == '':
            continue  # skip the empty ngram produced by an empty input
        total += 1
        counts[gram] = counts.get(gram, 0) + 1
    return counts, total
def rouge_l(reference_summary, created_summary, pp_options=None, extended_results=False):
    """
    Calculates the rouge-l value of a summary and its gold standard summary.

    :param reference_summary: Gold standard summary
    :param created_summary: Created summary to compare
    :param pp_options: options for preprocessing, if None then there will be no preprocessing
    :param extended_results: if True, precision, recall and f-score will be returned
    :return: the f-score, or (precision, recall, f-measure) if extended results are wanted
    """
    # preprocess
    if pp_options is not None:  # otherwise don't preprocess. Text is already preprocessed
        reference_summary = utils.preprocess_text(reference_summary, pp_options)
        created_summary = utils.preprocess_text(created_summary, pp_options)
    # word counts (m and n in the ROUGE paper) are taken before sentence splitting
    m_reference_word_number = len(reference_summary.split(' '))
    reference_summary = utils.split_into_sentences(reference_summary)
    n_created_word_number = len(created_summary.split(' '))
    created_summary = utils.split_into_sentences(created_summary)
    # one index set per sentence, to make sure every word is counted only once.
    # BUGFIX: the original built these as [set()] * len(...), repeating one shared
    # set object; that only worked because the sets were never mutated in place.
    used_created_indices = [set() for _ in created_summary]
    used_gold_indices = [set() for _ in reference_summary]
    for j, ref_sentence in enumerate(reference_summary):
        # union of the longest subsequences against every created sentence
        for i, created_sentence in enumerate(created_summary):
            indices_a, indices_b = get_subsequence(ref_sentence, created_sentence)
            used_gold_indices[j] = used_gold_indices[j].union(indices_a)
            used_created_indices[i] = used_created_indices[i].union(indices_b)
    # the min ensures words aren't counted twice on either side
    total_sum_subsequences = min(sum(len(s) for s in used_gold_indices),
                                 sum(len(s) for s in used_created_indices))
    if total_sum_subsequences == 0:
        # BUGFIX: return a triple here as well when extended results are requested,
        # instead of a bare scalar 0
        return (0, 0, 0) if extended_results else 0
    p_lcs = total_sum_subsequences / n_created_word_number
    r_lcs = total_sum_subsequences / m_reference_word_number
    f_lcs = ((1 + beta * beta) * r_lcs * p_lcs) / (r_lcs + beta * beta * p_lcs)
    if extended_results:
        return p_lcs, r_lcs, f_lcs
    return f_lcs
def get_subsequence(sent_a, sent_b):
    """
    Finds all (not necessarily consecutive) subsequences of a in b.

    For every possible start position in sent_a, greedily matches the remaining
    words of sent_a against sent_b left-to-right and keeps the longest match.

    :param sent_a: Sentence to find subsequences from
    :param sent_b: Sentence to find subsequence in
    :return: (ind_a, ind_b) two sets of indices of sent_a and sent_b of the longest subsequence
    """
    result_a = set()
    words_a = sent_a.split(' ')
    words_b = sent_b.split(' ')
    for word_index_a in range(len(words_a)):
        word_result = set()
        char_index_b = 0
        while word_index_a < len(words_a):
            # word is contained
            try:
                found_index = words_b.index(words_a[word_index_a], char_index_b)
                word_result.add(word_index_a)
                # NOTE(review): the next search starts AT found_index, not after it,
                # so a repeated word in sent_b can be matched again -- possibly
                # should be found_index + 1; confirm against the duplicate-word
                # tests before changing.
                char_index_b = found_index
                word_index_a += 1
            except ValueError:
                # word not contained in b, do nothing and advance in a
                word_index_a += 1
        if len(word_result) > len(result_a):
            result_a = word_result
    # NOTE(review): index() returns the FIRST occurrence, so duplicate words in
    # sent_a collapse to one b-index and result_b can be smaller than result_a.
    # rouge_l's min() over both sides appears to rely on this undercount.
    result_b = set([words_b.index(words_a[a_ind]) for a_ind in result_a])
    return result_a, result_b
# Smoke-test entry point; the module is normally imported, not executed directly.
if __name__ == "__main__":
    print('Done')

1439
rouge_evalauation/create_evaluation_files.py
File diff suppressed because it is too large
View File

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 84-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx

12
settings.py

@ -0,0 +1,12 @@
# creating BGH data
import spacy
# If True, bracketed text is stripped from the case texts when the dataframes
# are built (see data/download_rii.py), and pickles get the prefix below.
remove_brackets = False
# Presumably toggles server-specific file paths in utils -- confirm against
# utils.server_path before relying on it.
server = False
no_brackets_suffix = "no_br_"  # filename prefix for pickles built without brackets
nlp = spacy.load("de_core_news_sm")  # small one
# nlp = spacy.load("de_dep_news_trf") # big one, CUDA Problems on Server...

192
test/test_rouge.py

@ -0,0 +1,192 @@
from unittest import TestCase
import rouge
import utils
def get_file_data(filename):
    """
    Reads a whole text file as UTF-8.

    :param filename: path to the file
    :return: the file content as one string
    """
    # BUGFIX: use a context manager so the handle is closed even if read() raises
    # (the original open/read/close leaked the handle on error).
    with open(filename, "r", encoding='utf-8') as text_file:
        return text_file.read()
def run_tests_with_data(self, test_data):
    """
    For every entry, asserts that the rouge-1 score of the combined summary is
    at least as high as the score of either part alone.

    :param self: the TestCase instance supplying the assert methods
    :param test_data: list of [original, generated_one, generated_two, description]
    """
    for reference, first, second, description in test_data:
        print(description)
        print('original: ' + reference)
        score_first = rouge.rouge_n(reference, first, n=1)
        score_second = rouge.rouge_n(reference, second, n=1)
        print('Rouge for one: ' + str(score_first) + ' ' + first)
        print('Rouge for two: ' + str(score_second) + ' ' + second)
        both = second + ' ' + first
        score_both = rouge.rouge_n(reference, both, n=1)
        print('Rouge for combined: ' + str(score_both) + ' ' + both)
        self.assertGreaterEqual(score_both, score_first)
        self.assertGreaterEqual(score_both, score_second)
class RougeTest(TestCase):
def tests_from_paper(self):
    """Re-checks the ROUGE-L worked examples from Lin (2004) plus a two-sentence case."""
    # Example from the paper: s2 shares the in-order words "police the gunman" (3 of 4),
    # s3 only "the gunman" / "police" split across order (2 of 4).
    s1 = 'police killed the gunman'
    s2 = 'police kill the gunman'
    s3 = 'the gunman kill police'
    score_s2 = rouge.rouge_l(s1, s2)
    self.assertEqual(score_s2, 0.75)
    score_s3 = rouge.rouge_l(s1, s3)
    self.assertEqual(score_s3, 0.5)
    # Multi-sentence case: 4 reference words are covered across the two created
    # sentences; the created summary has 12 tokens, the reference 5.
    reference = 'affe birne club düne essen'
    summary = 'affe birne feder geld himmel. affe club insel jagd essen.'
    lcs = 4
    p = lcs / 12
    r = lcs / 5
    f = 2*(r*p)/(r+p)
    r_p, r_r, r_f = rouge.rouge_l(reference_summary=reference, created_summary=summary,
                                  pp_options=[utils.pp_option_stopwords],
                                  extended_results=True)
    self.assertEqual(r_r, r)
    self.assertEqual(r_p, p)
    self.assertEqual(r_f, f)
    # A summary compared with itself must score a perfect 1.
    score_equal = rouge.rouge_l(summary, summary)
    self.assertEqual(score_equal, 1)
def test_one(self):
    """Adding a second relevant sentence must strictly increase the rouge-1 score
    on a real German judgment excerpt."""
    original = 'Für die Frage, ob alle in Art. 6 Abs. 1 der Richtlinie 2011/83/EU genannten Informationen objektiv in einem Werbemittel dargestellt werden können, ist erheblich, welchen Anteil diese Informationen am verfügbaren Raum des vom Unternehmer ausgewählten Werbeträgers einnehmen würden; die Werbebotschaft muss gegenüber den Verbraucherinformationen nicht zurücktreten.'
    sent_1 = '(1) Zwar ist für die nach der Vorabentscheidung des Gerichtshofs der Europäischen Union maßgebliche Frage, ob alle in Art. 6 Abs. 1 der Richtlinie 2011/83/EU genannten Informationen objektiv in einem Werbemittel dargestellt werden können, erheblich, welchen Anteil diese Informationen am verfügbaren Raum des vom Unternehmer ausgewählten Werbeträgers einnehmen würden.'
    sent_2 = 'Aus der Anforderung, die Informationen objektiv in der Werbebotschaft darstellen zu können, ist zu schließen, dass die Werbebotschaft gegenüber den Verbraucherinformationen nicht zurücktreten muss.'
    rouge_v1 = rouge.rouge_n(original, sent_1, 1, pp_options=[utils.pp_option_stopwords])
    rouge_v2 = rouge.rouge_n(original, sent_1 + ' ' + sent_2, 1, pp_options=[utils.pp_option_stopwords])
    self.assertGreater(rouge_v2, rouge_v1)
def test_one_match(self):
    """A single overlapping unigram must already yield a positive rouge-1 score."""
    reference = 'a b c d e.'
    self.assertGreater(rouge.rouge_n(reference, 'd.', n=1), 0)
def test_extension(self):
    """Combining a summary with an EXTENDED variant of itself must never lower
    the rouge-1 score (monotonicity under added correct content)."""
    original_short = 'a b c d e.'
    original_medi = 'a b c d e f g h i j k l m n o.'
    original_long = 'a b c d e f g h i j k l m n o p q r s t u v w x y z.'
    # each entry: [reference, shorter candidate, extended candidate, description]
    test_data = [[original_short, 'a b.', 'a b d.', 'small extension short sentence'],
                 [original_short, 'a.', 'a b c d.', 'large extension short sentence'],
                 [original_medi, 'a b c d e f g h i.', 'a b c d e f g h i j.', 'small extension medi sentence'],
                 [original_medi, 'a b c d e f g h i.', 'a b c d e f g h i m n o l.',
                  'large extension medi sentence'],
                 [original_long, 'a b c d e f g h i j k l m n o p q r s t u v.',
                  'a b c d e f g h i j k l m n o p q r s t u v w.', 'small extension long sentence'],
                 [original_long, 'a b c d e f g h i j k.',
                  'a b c d e f g h i j k l m n o p q r s t u v w.', 'large extension long sentence'],
                 ]
    print('Test extensions')
    run_tests_with_data(self, test_data)
def test_differing(self):
    """Checks ROUGE behaviour when the created summary differs from the reference,
    for short, medium and long reference sentences."""
    short_ref = 'a b c d e.'
    medi_ref = 'a b c d e f g h i j k l m n o.'
    long_ref = 'a b c d e f g h i j k l m n o p q r s t u v w x y z.'
    cases = [
        [short_ref, 'a b c.', 'a b d.', 'small difference short sentence'],
        [short_ref, 'a e.', 'a b c.', 'large difference short sentence'],
        [medi_ref, 'a b c d e f g h i.', 'a b c d e f g h j.', 'small difference medi sentence'],
        [medi_ref, 'a b c d e f g h i.', 'a b c d j k l m.',
         'large difference medi sentence'],
        [long_ref, 'a b c d e f g h i j k l m n o p q r s t u v.',
         'a b c d e f g h i j k l m n o p q r s t u w.', 'small difference long sentence'],
        [long_ref, 'a b c d e f g h i j k.',
         'a b l m n o p q r s t u v.', 'large difference long sentence'],
    ]
    print('Test differences')
    run_tests_with_data(self, cases)
def test_rougel_high_precision_or_recall(self):
    """
    Tests ROUGE-L precision, recall and F1 on edge cases where one of the two
    is extreme, then checks on two real judgment/extract pairs that all scores
    stay inside the valid range (<= 1).
    """
    # Created summary repeats the single reference word: precision 1/2, recall 1.
    gold = 'Boot.'
    created = 'Boot. Boot.'
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertEqual(r_p, 1/2)
    self.assertEqual(r_r, 1)
    self.assertEqual(r_f, 2/3)
    # Created summary matches 2 of 6 reference tokens: precision 1, recall 2/6.
    gold = 'Affe Boot. Boot Club.'
    created = 'Boot.'
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertEqual(r_p, 1)
    self.assertEqual(r_r, 2/6)
    self.assertEqual(r_f, 1/2)
    # Real-world pair: created summary is a near-verbatim extract of one gold
    # sentence; precision must not exceed 1.
    gold = 'Im Rahmen der bei Prüfung der Schutzschranke der Berichterstattung über Tagesereignisse gemäß § 50 ' \
           'UrhG vorzunehmenden Grundrechtsabwägung ist im Falle der Veröffentlichung eines bislang ' \
           'unveröffentlichten Werks auch das vom Urheberpersönlichkeitsrecht geschützte Interesse an einer ' \
           'Geheimhaltung des Werks zu berücksichtigen. Dieses schützt das urheberrechtsspezifische Interesse des ' \
           'Urhebers, darüber zu bestimmen, ob er mit der erstmaligen Veröffentlichung den Schritt von der ' \
           'Privatsphäre in die Öffentlichkeit tut und sich und sein Werk damit der öffentlichen Kenntnisnahme ' \
           'und Kritik aussetzt. Nicht zu berücksichtigen ist bei dieser Abwägung dagegen das Interesse an der ' \
           'Geheimhaltung von Umständen, deren Offenlegung Nachteile für die Interessen des Staates und seiner ' \
           'Einrichtungen haben könnten. Dieses Interesse ist nicht durch das Urheberpersönlichkeitsrecht, ' \
           'sondern durch andere Vorschriften - etwa das Sicherheitsüberprüfungsgesetz, § 3 Nr. 1 Buchst. b IFG ' \
           'und die strafrechtlichen Bestimmungen gegen Landesverrat und die Gefährdung der äußeren Sicherheit ' \
           'gemäß §§ 93 ff. StGB - geschützt. '
    created = 'Dieses Interesse ist vielmehr durch die allgemeinen Vorschriften - etwa das ' \
              'Sicherheitsüberprüfungsgesetz, § 3 Nr. 1 Buchst. b IFG und die strafrechtlichen Bestimmungen gegen ' \
              'Landesverrat und die Gefährdung der äußeren Sicherheit gemäß §§ 93 ff. '
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertLessEqual(r_p, 1)
    # Second real-world pair: long extract with heavy overlap; both precision
    # and recall must stay <= 1.
    gold = 'Der Eigentümer eines Grundstücks ist hinsichtlich der von einem darauf befindlichen Baum (hier: ' \
           'Birken) ausgehenden natürlichen Immissionen auf benachbarte Grundstücke Störer i.S.d. § 1004 Abs. 1 ' \
           'BGB, wenn er sein Grundstück nicht ordnungsgemäß bewirtschaftet. Hieran fehlt es in aller Regel, ' \
           'wenn die für die Anpflanzung bestehenden landesrechtlichen Abstandsregelungen eingehalten sind. 1b. ' \
           'Ein Anspruch auf Beseitigung des Baums lässt sich in diesem Fall regelmäßig auch nicht aus dem ' \
           'nachbarlichen Gemeinschaftsverhältnis herleiten. Hält der Grundstückseigentümer die für die ' \
           'Anpflanzung bestehenden landesrechtlichen Abstandsregelungen ein, hat der Eigentümer des ' \
           'Nachbargrundstücks wegen der Beeinträchtigungen durch die von den Anpflanzungen ausgehenden ' \
           'natürlichen Immissionen weder einen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 BGB in unmittelbarer ' \
           'Anwendung noch einen nachbarrechtlichen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 analog (' \
           'Abgrenzung zu Senat, Urteil vom 27. Oktober 2017 - V ZR 8/17, ZfIR 2018, 190). '
    created = "Für die Entscheidung des Meinungsstreits ist von dem oben dargelegten Grundsatz auszugehen, " \
              "dass der Eigentümer eines Grundstücks hinsichtlich der von einem darauf befindlichen Baum " \
              "ausgehenden natürlichen Immissionen auf benachbarte Grundstücke Störer i.S.d. § 1004 Abs. 1 BGB " \
              "ist, wenn er sein Grundstück nicht ordnungsgemäß bewirtschaftet. Hält der Grundstückseigentümer " \
              "die für die Anpflanzung bestehenden landes-rechtlichen Abstandsregelungen ein, hat der Eigentümer " \
              "des Nachbargrund-stücks wegen der Beeinträchtigungen durch die von den Anpflanzungen ausgehenden " \
              "natürlichen Immissionen weder einen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 BGB in " \
              "unmittelbarer Anwendung noch einen nachbarrechtlichen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 " \
              "analog. Sind die für die Anpflanzung bestehenden landesrechtlichen Abstandsregelungen eingehalten, " \
              "lässt sich ein Anspruch auf Beseitigung der Bäume in aller Regel - und so auch hier - nicht aus " \
              "dem nachbarlichen Gemeinschaftsverhältnis herleiten. Gemäß § 907 Abs. 2 BGB gehören aber Bäume und " \
              "Sträucher nicht zu den Anlagen i.S.d. § 907 Abs. 1 BGB. Ob den Grundstückseigentümer für " \
              "natürliche Immissionen eine „Sicherungspflicht“ trifft und er damit Störer i.S.d. § 1004 Abs. 1 " \
              "BGB ist, ist jeweils anhand der Umstände des Einzelfalls zu prüfen. Rechtsfehlerhaft ist jedoch " \
              "die Auffassung des Berufungsgerichts, der Beklagte sei als Störer i.S.d. § 1004 Abs. 1 BGB für die " \
              "von den Birken ausgehenden Immissionen auf das Grundstück des Klägers verantwortlich. In diesem " \
              "Fall ist er regelmäßig schon nicht Störer, so dass es bereits an einem Beseitigungsanspruch gemäß " \
              "§ 1004 Abs. 1 BGB fehlt und der von dem Berufungsgericht beschriebene Konflikt zwischen den Regeln " \
              "des Bürgerlichen Gesetzbuchs und den landesrechtlichen Vorschriften nicht besteht. Voraussetzung " \
              "hierfür ist jedoch, dass der in Anspruch genommene Grundstückseigentümer für die " \
              "Eigentumsbeeinträchtigung verantwortlich und damit Störer i.S.d. § 1004 Abs. 1 BGB ist. "
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertLessEqual(r_p, 1)
    self.assertLessEqual(r_r, 1)
def test_specific(self):
    """Regression test: an unrelated second sentence must score lower on its own,
    while concatenating both created sentences must still raise the F-score."""
    gold = 'Diese Voraussetzungen hat der XII. Zivilsenat für den vorliegenden Fall bejaht.'
    created = 'Dies ist insbesondere der Fall, wenn die Sanktion außer Verhältnis zum Gewicht des Vertragsverstoßes und den Folgen für den Schuldner der Vertragsstrafe steht.'
    created_2 = 'Deren Untergrenze ist mit 30 € angegeben.'
    shared_options = [utils.pp_option_stopwords, utils.pp_option_lemmatize]
    _, _, f_first = rouge.rouge_l(created_summary=created, reference_summary=gold,
                                  extended_results=True, pp_options=shared_options)
    _, _, f_second = rouge.rouge_l(created_summary=created_2, reference_summary=gold,
                                   extended_results=True, pp_options=shared_options)
    _, _, f_combined = rouge.rouge_l(created_summary=created + ' ' + created_2, reference_summary=gold,
                                     extended_results=True, pp_options=shared_options)
    self.assertGreater(f_first, f_second)
    self.assertGreater(f_combined, f_first)

475
utils.py

@ -0,0 +1,475 @@
import json
import multiprocessing
import os
import re
import shutil
import pandas as pd
import settings
# Multiprocessing pool configuration used by parallel_imap.
pool_processes = 8
pool_maxtask = 10
pool_chunksize = 30
# Section / column names of the court-decision dataframes.
leitsatz_str = 'leitsatz'
tenor_str = 'tenor'
tatbestand_str = 'tatbestand'
entscheidungsgruende_str = 'entscheidungsgruende'
aktenzeichen_str = 'aktenzeichen'
# Columns that contain free running text of a decision.
rii_text_columns = [leitsatz_str, tenor_str, tatbestand_str, entscheidungsgruende_str]
# Punctuation treated as sentence / clause marks throughout this module.
sentence_marks = ['.', ',', ';', '!', '?']
# Option flags accepted by preprocess_text; the string values double as descriptions.
pp_option_lemmatize = 'preprocessing: lemmatize the text'
pp_option_stopwords = 'preprocessing: remove stopwords'
pp_option_case_normalize = 'preprocessing: normalize cases / put to lower'
pp_option_remove_qout_marks_sing = 'preprocessing: remove qoutation marks around single words'
# Stopwords that must never be removed because they negate meaning.
no_stopword_list = ['nicht', 'kein']
# Sentences that typically open part II ("Entscheidungsgründe") of a judgment.
entsch_gr_start_sentences = ['II.', 'B.', 'B']
def server_path(current_path, path):
    """
    Method to add path in case it is run on server.

    :param current_path: Path to add when run on server
    :param path: Path for local
    :return: Final path for local or server
    """
    if not settings.server:
        return path
    return current_path + '/' + path
def open_file(current_path, path, modes, encoding=None, newline=None):
    """
    Wraps the builtin open function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :param modes: Modes to apply
    :param encoding: encoding option of the original method, if None nothing will be passed
    :param newline: newline option of the original method, if None nothing will be passed
    :return: the opened file
    """
    # Build the keyword arguments dynamically so that None values are simply not
    # passed on to open(). This fixes the original version, in which the
    # "encoding and newline both given" branch was unreachable (earlier single-
    # option branches returned first), silently dropping the newline setting.
    kwargs = {}
    if encoding is not None:
        kwargs['encoding'] = encoding
    if newline is not None:
        kwargs['newline'] = newline
    return open(server_path(current_path=current_path, path=path), modes, **kwargs)
def file_exists(current_path, path):
    """
    Wraps the builtin exists function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: True if the file exists
    """
    full_path = server_path(current_path=current_path, path=path)
    return os.path.exists(full_path)
def list_dir_files(current_path, path):
    """
    Wraps the builtin os.listdir function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The filenames of the directory
    """
    full_path = server_path(current_path=current_path, path=path)
    return os.listdir(full_path)
def df_from_pickle(current_path, path):
    """
    Wraps the pd.read_pickle function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The loaded dataframe
    """
    full_path = server_path(current_path=current_path, path=path)
    return pd.read_pickle(full_path)
def df_to_json(current_path, path, dataframe):
    """
    Wraps the df.to_json function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :param dataframe: The dataframe to save
    """
    target = server_path(current_path=current_path, path=path)
    dataframe.to_json(target)
def df_from_json(current_path, path):
    """
    Wraps the json.load function in combination with a dataframe creation to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The loaded dataframe
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original passed the open file straight into json.load and leaked it).
    with open_file(current_path=current_path, path=path, modes="r") as file:
        return pd.DataFrame(json.load(file))
def time_convert(sec):
    """
    Pretty-prints a duration. Format: Time Lapsed = hh:mm:ss

    :param sec: duration in seconds to display
    """
    hours, remainder = divmod(sec, 3600)
    mins, secs = divmod(remainder, 60)
    print("Time Lapsed = {0}:{1}:{2}".format(int(hours), int(mins), secs))
def parallel_imap(function, packaged_args):
    """
    Executes the given function in a parallel way. For list data.

    :param function: Function to do in parallel.
    :param packaged_args: Iterable of argumentpairs for each run to be done.
    :return: Result of the parallel work (a lazy imap iterator)
    """
    if settings.server:
        # On the server: recycle worker processes after pool_maxtask tasks and
        # use a larger chunksize to cut inter-process communication overhead.
        pool_obj = multiprocessing.Pool(maxtasksperchild=pool_maxtask)
        result = pool_obj.imap(function, packaged_args, chunksize=pool_chunksize)
    else:
        # Locally: fixed number of worker processes, default chunksize.
        pool_obj = multiprocessing.Pool(processes=pool_processes)
        result = pool_obj.imap(function, packaged_args)
    # NOTE(review): imap returns a lazy iterator, yet the pool is closed and
    # joined before the caller consumes it — this works because join waits for
    # all submitted tasks to finish, but confirm the iterator is fully consumed
    # by callers before relying on this pattern.
    pool_obj.close()
    pool_obj.join()
    return result
def get_step_subset_raw(steps, path_to_dest_dataframe, source_data, dest_data, call_path):
    """
    Method for stepwise work on datasets. Reads in the already present data and starts
    where last time ended. Used for raw pickle-files in destination.

    :param steps: How many rows should be selected now
    :param path_to_dest_dataframe: Path on where to load the destination data
    :param source_data: Source dataframe (or plain sequence) to select the rows from
    :param dest_data: empty dataframe to load the data into
    :param call_path: path from which the method was called, for server path
    :return: the subset of the source data and the loaded destination data (source, dest)
    """
    if steps > 0:
        try:
            try:
                # Destination results may be stored as pickle ...
                var = df_from_pickle(current_path=call_path, path=path_to_dest_dataframe)
            except Exception:
                # ... or as json; fall back to that format.
                var = df_from_json(current_path=call_path, path=path_to_dest_dataframe)
            dest_data = pd.concat([dest_data, var], ignore_index=True)
            # Resume directly after the rows that were already processed.
            start = dest_data.shape[0]
        except OSError as _:
            # No previous results on disk: start from the beginning.
            start = 0
        finally:
            # NOTE(review): returning from inside `finally` swallows any exception
            # raised above, and `start` would be unbound if a non-OSError escaped
            # both load attempts — confirm this is the intended best-effort style.
            end = start + steps
            try:  # case source is a dataframe
                if end >= source_data.shape[0]:
                    return source_data.iloc[start:], dest_data  # subset
                else:
                    return source_data.iloc[start:end], dest_data  # subset
            except Exception:
                # Source has no .shape / .iloc: treat it as a plain sequence.
                if end >= len(source_data):
                    return source_data[start:], dest_data  # subset
                else:
                    return source_data[start:end], dest_data  # subset
    # NOTE(review): implicitly returns None when steps <= 0 — verify callers expect this.
def remove_spaces_before_sentence_marks(text):
    """
    Removes unnecessary spaces before '.' and the other sentence marks.

    :param text: Text to replace in
    :return: The cleaned text
    """
    for mark in sentence_marks:
        padded = ' ' + mark
        # Repeat until no padded occurrence is left (handles runs of spaces).
        while padded in text:
            text = text.replace(padded, mark)
    return text
def remove_brackets(text):
    """
    Removes all matching round bracket pairs () with their content. Always takes the
    first brackets that appear in the text, so could also be an enumeration like a).

    Fix over the original: an unmatched '(' that is not at position 0 no longer
    passes the closing-bracket check (the old code added find()'s -1 to the start
    index, which stayed > -1 and silently dropped the space before the bracket).

    :param text: Text to remove the brackets from.
    :return: Resulting text
    """
    res = ''
    startindex = text.find('(')
    while startindex > -1:
        # Search the closing bracket only after the opening one.
        endindex = text.find(')', startindex)
        if endindex == -1:
            # Unmatched '(' - keep the remainder untouched.
            break
        # In case there is a ' ' in front of the brackets, remove one space
        # so the surrounding words do not end up with a double space.
        if startindex > 0 and text[startindex - 1] == ' ':
            startindex -= 1
        res += text[:startindex]
        text = text[endindex + 1:]
        startindex = text.find('(')
    return res + text
def remove_leading_keywords_and_listing_sentences(sentences):
    """
    Method intended for Leitsätze. Some of them start with a single keyword in the first line.
    This is removed. Additionally, sentences which are only a listing ('1.') will also be removed.

    :param sentences: List of sentences in the original order to remove these things from
    :return: the list of sentences after removing
    """
    # remove leading keywords and sentences which are only enumerations
    sentences_var = list()
    # Buffer accumulating fragments until a real sentence end is reached.
    sentence_var = ''
    for i in range(len(sentences)):
        sentence = sentences[i].strip()
        if len(sentence) > 1 and sentence[-1] == '.' and ' ' not in sentence:  # at least two chars
            if any(char.isdigit() for char in sentence) and sentence[0].isdigit():  # most likely enumeration like '1.'
                continue
        # A first-position fragment of <= 20 chars is treated as a leading
        # keyword and silently dropped (the branch below is skipped for it).
        if i > 0 or (i == 0 and len(sentence) > 20):
            # most likely not a short keyword at the beginning
            # NOTE(review): an empty fragment here would raise IndexError on
            # sentence[-1] — confirm inputs never contain empty strings.
            if sentence[-1] == '.' or sentence[-1] == ',' or sentence[-1] == ':' or \
                    sentence[-1] == ';' or sentence[-1] == '!' or sentence[-1] == '?':
                # sentence end: flush the buffer as one cleaned sentence
                sentence_var += sentence
                sentences_var.append(remove_spaces_before_sentence_marks(sentence_var))
                sentence_var = ''
            else:
                # continuing sentence
                sentence_var += sentence + ' '
    return sentences_var
def prepare_leitsatz(l_satz):
    """
    Does the preparation for Leitsätze: splits into sentences, removes leading
    keywords and single listing sentences, then strips leading listings from
    each remaining sentence.

    :param l_satz: Original Leitsatz as one string
    :return: prepared Leitsatz as a list of String
    """
    prepared = split_into_sentences(l_satz)
    prepared = remove_leading_keywords_and_listing_sentences(prepared)
    return [remove_leading_listing(item) for item in prepared]
def select_list_subset(list_of_string, start_strings, end_string=None):
    """
    Selects a contiguous subset of a list of strings (case-sensitive).

    Copying starts at the first element contained in start_strings and stops at
    the first element equal to end_string (which is itself excluded); without an
    end_string, copying runs to the end. If no start string is found — or the
    selection covers less than 20% of the input — the whole original list is
    returned instead.

    :param list_of_string: List to get subset from
    :param start_strings: List of Strings to start to copy
    :param end_string: First string where one shouldn't copy anymore, if none is given, then till the end
    :return: Selected subset
    """
    selected = []
    copying = False
    for item in list_of_string:
        if item in start_strings:
            copying = True
        if end_string is not None and item == end_string:
            copying = False
        if copying:
            selected.append(item)
    # Fall back to the full list when nothing (or almost nothing) was selected.
    if not selected or len(selected) / len(list_of_string) < 0.2:
        return list_of_string
    return selected
def abbreviation_ending(text):
    """
    Checks for an input text whether it ends with a known legal abbreviation.
    Known issues: numbers and roman numbering with following dots arent matched.

    Fix over the original: a missing comma between ' zw.' and 'z.Z.' made Python
    concatenate the two literals into the bogus entry ' zw.z.Z.', so neither
    abbreviation was ever recognised.

    :param text: Input Text
    :return: True, if it does end with such an abbreviation, False otherwise
    """
    # Entries with a leading space only match as separate words; entries without
    # one also match as a word suffix.
    abbrev_list = ['A.', ' a.', 'a.A.', 'a.a.O.', 'ABl.', ' abl.', 'Abs.', ' abs.', 'Abschn.', 'Abse.',
                   ' abzgl.', 'a.D.', 'a.E.', ' a.F.', ' ähnl.', 'a.l.i.c.', ' allg.', ' allgem.',
                   'Alt.', 'AmtsBl.', ' and.', ' angef.', 'Anh.', 'Anl.', 'Anm.', ' Art.', '(Art.', ' aufgeh.',
                   'Aufl.', ' ausf.', 'Ausn.', 'BAnz.', 'BArbBl.', 'BayJMBl.', 'Bd.', 'Bde.', 'Bdg.',
                   'Bearb.', ' begr.', 'Beil.', 'Bek.', ' ber.', ' bes.', 'Beschl.', ' best.', ' bestr.',
                   'Betr.', ' betr.', 'Bf.', 'BGBl.', ' bish.', ' Bl.', 'BPräs.', 'BReg.', 'Bsp.', 'Bst.',
                   'BStBl.', 'BT-Drucks.', 'Buchst.', 'bzgl.', 'bzw.', 'c.i.c.', 'Co.', 'c.p.c.',
                   'c.s.q.n.', 'Ct.', ' dar.', 'Darst.', ' ders.', 'd.h.', 'Diss.', ' div.', 'Dr.',
                   'Drucks.', ' dto.', 'DVBl.', ' ebd.', ' Ed.', 'E.G.', ' eingef.', 'Einf.', 'Einl.',
                   ' einschl.', 'Erg.', ' erk.Sen.', ' erk.', ' Erl.', 'etc.', 'E.U.', ' e.V.',
                   'EVertr.', ' evtl.', 'E.W.G.', ' F.', ' f.', ' Fa.', ' Festschr.', ' ff.', ' Fn.',
                   ' form.', ' fr.', ' fr.Rspr.', ' Fz.', 'GBl.', ' geänd.', 'Gedschr.', ' geg.',
                   ' gem.', 'Ges.', ' gg.', ' ggf.', ' ggü.', ' ggüb.', ' Gl.', ' GMBl.', 'G.o.A.',
                   'Grds.', ' grdsl.', 'Großkomm.', 'Großkomm.z.', 'GVBl.', 'GVOBl.', ' h.A.', 'Halbs.',
                   ' h.c.', 'Hdlg.', 'Hess.', ' heut.', ' heut.Rspr.', ' hins.', ' h.L.', ' h.Lit.',
                   ' h.M.', 'Hrsg.', ' h.Rspr.', 'HS.', 'Hs.', ' i.A.', ' ib.', ' ibd.', ' ibid.',
                   'i.d.', 'i.d.F.', 'i.d.R.', 'i.d.S.', 'i.E.', 'i.e.', 'i.e.S.', 'i.H.d.', 'i.H.v.',
                   'i.K.', ' incl.', ' inkl.', 'inkl.MwSt.', ' insb.', ' insbes.', 'Int.', ' i.O.',
                   ' i.R.', ' i.R.d.', 'i.S.', 'i.S.d.', 'i.S.e.', 'i.S.v.', 'i.ü.', ' iur.', 'i.V.',
                   'i.V.m.', 'i.W.', 'i.Wes.', 'i.w.S.', 'i.Zw.', 'Jahrb.', ' jew.', ' Jh.', 'JMBl.',
                   ' jur.', ' Kap.', ' Ko.', ' krit.', ' kzfr.', 'Lb.', 'Lfg.', 'lgfr.', ' Lief.',
                   'Lit.', ' lit.', ' lt.', 'Ltd.', 'M.A.', 'm.Änd.', 'MABl.', 'mat.', 'm.a.W.', 'm.E.',
                   ' med.', 'mgl.', 'Mglkt.', 'MinBl.', 'Mio.', ' Mot.', 'M.M.', 'm.N.', 'Mod.',
                   ' mögl.', 'Mot.', 'MünchKomm.', 'm.w.', 'm.w.N.', 'MwSt.', 'Mwst.', 'm.W.v.',
                   'm.zust.Anm.', 'Nachw.', 'Nachw.b.', ' nat.', 'Nds.', 'Neubearb.', 'Neuf.',
                   ' neugef.', 'n.F.', 'Nr.', 'Nrn.', ' o.', 'o.Ä.', ' od.', ' oec.', ' öff.',
                   ' o.g.', ' österr.', 'p.F.V.', ' pharm.', ' phil.', ' pol.', 'Postf.', ' pp.',
                   ' ppA.', ' ppa.', 'Prof.', 'Prot.', ' publ.', ' p.V.', 'p.V.V.', 'q.e.d.',
                   'RdErl.', 'Rdn.', 'Rdnr.', 'RdSchr.', ' rel.', ' rer.', 'RGBl.', 'Rn.', 'Rspr.',
                   'Rz.', 'S.', ' s.', 's.a.', 'Schr.', ' scil.', 'Sen.', ' sinngem.', 'SiZess.',
                   'Slg.', 's.o.', ' sog.', 'Sonderbeil.', 'Stpfl.', ' str.', ' st.', 'st.Rspr.',
                   ' st. Rspr.', 'stud.iur.', 's.u.', ' teilw.', ' theol.', 'Thür.', ' TO.', ' tw.',
                   'Tz.', ' u.', 'u.a.', 'UAbs.', 'u.a.m.', ' umstr.', ' unmgl.', 'Unmglkt.', ' unmögl.',
                   'Urt.', ' usw.', ' u.U.', ' V.', ' v.', 'Var.', 'Ver.', ' vgl.', 'V.m.', 'VOBl.',
                   'Vor.', 'Vorbem.', 'Warn.', ' weg.', ' wg.', 'W.G.G.', 'w.z.b.w.', 'z.B.', 'z.Hd.',
                   'Ziff.', 'z.T.', ' zust.', 'zust.Anm.', ' zw.', 'z.Z.', ' zzgl.', ';', 'II.1.a.', '(s.',
                   ]
    for abbrev in abbrev_list:
        if text.endswith(abbrev):
            return True
    # Fallback: a single-character word followed by a dot (e.g. initials ' M.').
    if len(text) >= 3 and re.search(" .\\.", text[-3:]):
        return True
    return False
def remove_leading_listing(sentence):
    """
    Removes a leading listing / enumeration marker like '1.' or 'a)'.

    :param sentence: Sentence to remove from
    :return: Processed sentence
    """
    _, remainder = split_leading_listing(sentence)
    return remainder
def split_leading_listing(sentence):
    """
    Splits the sentence from a possible listing (1. or a) ) at the start.

    :param sentence: Sentence to split
    :return: (start, rest) with start being the listing or None, if there is no listing and
            rest being the rest of the sentence or the original sentence if there was no listing
    """
    tokens = sentence.split()
    head = tokens[0] if tokens else ''
    remainder = sentence[len(head) + 1:]
    # 'Art.' and single trailing words are excluded: could be a name like M. Leicht
    looks_like_enum = (head.endswith('.') or head.endswith(')')) \
        and len(remainder.split()) > 1 and head != 'Art.'
    if looks_like_enum:
        return head, remainder
    return None, sentence
def split_into_sentences(input_text):
    """
    Splits text into sentences. Uses spacy sentences but fixes broken sentences on \n or Abbreviations.

    :param input_text: Text to split into sentences
    :return: A list of sentences which where split
    """
    paragraphs = input_text.split('\n')
    sentences = list()
    # Buffer that accumulates spacy fragments until a real sentence end is found.
    sentence_var = ''
    # roughly split original leitsatz into sentences
    for paragraph in paragraphs:
        nlp_paragraph = settings.nlp(paragraph)
        sentences_paragraph = []
        for sent in nlp_paragraph.sents:
            sent = sent.text.strip()
            # some leading listings aren't detected by spacy, split them off here
            a, b = split_leading_listing(sent)
            if a is not None:
                sentences_paragraph.append(a)
            sentences_paragraph.append(b)
        for i in range(0, len(sentences_paragraph)):
            # add a space before next token if it isn't a sentence mark
            if not (sentences_paragraph[i].startswith('.') or sentences_paragraph[i].startswith(':')
                    or sentences_paragraph[i].startswith('?') or sentences_paragraph[i].startswith('!')):
                sentence_var += ' '
            sentence_var += sentences_paragraph[i]
            # if not sentence_var.count('(') > sentence_var.count(
            #         ')') and not sentence_var.strip() == '':  # no unclosed brackets
            if (sentences_paragraph[i].endswith('.') or sentences_paragraph[i].endswith(':')
                or sentences_paragraph[i].endswith('?') or sentences_paragraph[i].endswith('!')) \
                    and not abbreviation_ending(sentence_var) \
                    and not sentence_var.strip() == '':
                # the sentence is most likely really finished here
                sentences.append(sentence_var.strip())
                sentence_var = ''
        if not sentence_var.strip() == '':
            # if not sentence_var.count('(') > sentence_var.count(
            #         ')') and not sentence_var.strip() == '':  # no unclosed brackets
            sentences.append(sentence_var.strip())  # flush at the end of the paragraph as well
            sentence_var = ''
    # end of whole text
    if sentence_var.strip() != '':
        sentences.append(sentence_var.strip())
    return sentences
def preprocess_text(text, options):
    """
    Allows simple preprocessing like lemmatization on strings.

    :param text: Text to preprocess
    :param options: Options specifying on what preprocessing is to be done, if None, text will be returned
    :return: the preprocessed text, if text is None, the result will also be ''
    """
    if text is None:
        return ''
    if options is None:
        return text
    pieces = []
    for token in settings.nlp(text):
        # keep a token unless it is a stopword, stopword removal is requested,
        # and it is not a protected negation word
        keep = (not token.is_stop
                or pp_option_stopwords not in options
                or token.lemma_ in no_stopword_list)
        if not keep:
            continue
        # lemmatization if wanted (sentence marks are kept verbatim)
        if pp_option_lemmatize in options and token.text not in sentence_marks:
            word = token.lemma_
        else:
            word = token.text
        # strip quotation marks wrapped around a single word if requested
        if pp_option_remove_qout_marks_sing in options and word[0] == '"' and word[-1] == '"':
            word = word.replace('"', '')
        pieces.append(word)
    result_text = ' '.join(pieces)
    # case-normalization, all to lower
    if pp_option_case_normalize in options:
        return result_text.lower()
    return result_text
def create_dir(current_path, directory_name, delete=True):
    """
    Creates a directory if it doesn't exist.

    :param current_path: path of the calling file
    :param directory_name: name / path to create
    :param delete: if True, an existing directory with the same name is deleted first
    """
    full_path = server_path(current_path=current_path, path=directory_name)
    if delete and file_exists(current_path=current_path, path=directory_name):
        shutil.rmtree(full_path)
    if not file_exists(current_path=current_path, path=directory_name):
        os.makedirs(full_path)
Loading…
Cancel
Save