commit 13d30ecd6fabcc757af24610f6dff9a46ea6c3f3 Author: Bianca Steffes Date: Wed Apr 26 14:43:45 2023 +0200 Added Code and data diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9eafcfe --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.idea +__pycache__ +data/dataframes +rouge_evalauation/dataframes +rouge_evalauation/figures +rouge_evalauation/evaluated_data/second_eval/AufeinanderfolgendeSätze.xlsx +rouge_evalauation/manual_evaluation diff --git a/data/download_rii.py b/data/download_rii.py new file mode 100644 index 0000000..662d134 --- /dev/null +++ b/data/download_rii.py @@ -0,0 +1,214 @@ +import time +import xml.etree.ElementTree as ET +import urllib.request as request +import zipfile +import os + +import pandas as pd + +import settings +import utils +from utils import time_convert + +base_dir_bgh = 'raw_data/BGH_Data' +extended_dir_bgh = base_dir_bgh + '/senates' +dataframe_dir_bgh = 'dataframes/bgh/' +pickle_name_bgh = 'bgh_data.pkl' +simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum", + "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile", + "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende", + "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language", + "publisher", "accessRights"] +nested_attributes = ["region_abk", "region_long"] +text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand", + "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"] +stopword_extension = '_no_stopwords' +current_path = 'data' + + +def get_file_list(): + """ + Makes http request for the files + :return: the web page with all current cases as an xml-tree + """ + xml_file, https_message = request.urlretrieve('https://www.rechtsprechung-im-internet.de/rii-toc.xml') + tree = ET.parse(xml_file) + root = tree.getroot() + return root + + +def count_cases(root, tag): + """ + counts all cases belonging to the given tag and returns the count + :param root: downloaded xml-tree with all files + :param tag: tag to find in the name + :return: number of cases belonging to the BGH + """ + count = 0 + for child in root: + if tag in child[0].text: + count += 1 + return count + + +def download(base_dir, extended_dir, tag): + """ + download all cases to a folder related to their senats + :param base_dir: Name of the directory for the data + :param extended_dir: name of the subdirectory for saving + :param tag: tag to recognize the court (BGH, BVerwG) + """ + # set up directories + utils.create_dir(current_path=current_path, directory_name=base_dir) + utils.create_dir(current_path=current_path, directory_name=extended_dir) + # do the download + root = get_file_list() # 0 ist gericht, 3 ist link + max_cases = count_cases(root, tag) + downloaded = 0 + for child in root: + while True: + try: + if tag in child[0].text: + filename, http = request.urlretrieve(child[3].text) + with zipfile.ZipFile(filename, 'r') as zip_ref: + zip_ref.extractall( + utils.server_path(current_path=current_path, + path=extended_dir + '/' + child[0].text.replace('\n', '') + '/')) + os.remove(filename) + downloaded += 1 + print("\rDownloaded %d of %d " % (downloaded, max_cases) + tag + "Cases", end="") + finally: + break + print("\nDone!") + + +def read_file_data(file): + """ + Reads the data of one case / file. 
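+    While joining the XML text nodes, whitespace before the sentence marks '.', ',', ';', '!' and '?' is removed.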
+ + :param file: package containing (filename, directory, directory extension) to address the file + :return: a dictionary with key: attribute_name and value: attribute_value + """ + filename, directory, extended_dir = file + tree = ET.parse(utils.server_path(current_path=current_path, path=os.path.join(extended_dir, directory, filename))) + root = tree.getroot() + res = {} + for attribute in simple_attributes: + attr = root.find(attribute) # leitsatz überprüfen: zwei Worte zusammen, aber leerzeichen immer noch da! + text = '' + for t in attr.itertext(): + if t == '.' or t == ',' or t == ';' or t == '!' or t == '?': + text = text.strip() # remove space before these characters + text += t + ' ' + text = text.strip() + if text == '': + res[attribute] = None + else: + res[attribute] = text + + for attribute in nested_attributes: + nesting = attribute.split('_') + xml_tag = root + # find nested attribute + for i in range(len(nesting)): + xml_tag = xml_tag.find(nesting[i]) + text = "" + for t in xml_tag.itertext(): + if t == '.' or t == ',' or t == ';' or t == '!' or t == '?': + text = text.strip() # remove space before these characters + text += t + ' ' + text = text.strip() + if text == '': + res[attribute] = None + else: + res[attribute] = text + + for attribute in utils.rii_text_columns: + if res[attribute] is not None: + if settings.remove_brackets: + res[attribute] = utils.remove_brackets(res[attribute]) + res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute]) + + return pd.DataFrame(res, index=[0]) + + +def create_pickle(extended_dir, pickle_name, steps): + """ + Combines all downloaded files of the given extended directory into one pickle + + :param extended_dir: extended dir to find the files + :param pickle_name: name of the pickle to save + :param steps: how many cases should be worked on now + """ + utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False) + start_time = time.time() + extension = '' + if settings.remove_brackets: + extension = settings.no_brackets_suffix + + files = [(filename, directory, extended_dir) for directory in + utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in + utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory)) + if filename.endswith(".xml")] + + original_length = len(files) + data = pd.DataFrame(columns=simple_attributes + nested_attributes) + + pickle_path = dataframe_dir_bgh+extension+pickle_name + + files, data = utils.get_step_subset_raw(steps=steps, + path_to_dest_dataframe=pickle_path, + source_data=files, + dest_data=data, + call_path=current_path) + + result = utils.parallel_imap(read_file_data, files) + for row in result: + data = pd.concat([data, row], ignore_index=True) + with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f: + data.to_pickle(f) + + print('Resulting dataframes have length ' + str(data.shape[0]) + + ' (' + str(data.shape[0] / original_length * 100) + '%)') + end_time = time.time() + time_lapsed = end_time - start_time + time_convert(time_lapsed) + + +def get_selected_bgh_data(directory='.\\'): + """ + Shortcut for getting the BGH data currently needed. 
+    Selects all data from the civil senates that contain 'Urteile'
+
+    :param directory: directory offset from current position, with ending slashes
+    :return: the data
+    """
+    return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')
+
+
+def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
+    """
+    Method for access to the BGH pickle
+    :param pickle_name: name to identify the data
+    :param directory: directory path to the data file (with ending slash)
+    :param spruchkoerper: Parameter can be used to select the senates (checks whether the given string is contained
+        in the data's spruchkoerper)
+    :param doktyp: can be used to select specific documents (like 'Urteil', 'Beschluss', etc.), must contain the word
+    :return: The data as a pandas dataframe
+    """
+    extension = ''
+    if settings.remove_brackets:
+        extension = settings.no_brackets_suffix
+    data = utils.df_from_pickle(current_path=current_path, path=directory + dataframe_dir_bgh + extension + pickle_name)
+    if spruchkoerper is not None:
+        data = data[data['spruchkoerper'].notnull()]
+        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
+    if doktyp is not None:
+        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
+    data = data.dropna(axis=1, how='all')  # drop all columns with no value
+    data = data.drop_duplicates()
+    return data
+
+
+# if __name__ == "__main__":
+    # download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
+    # create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)
diff --git a/pm_summary/annotation_evaluation.py b/pm_summary/annotation_evaluation.py
new file mode 100644
index 0000000..87adfba
--- /dev/null
+++ b/pm_summary/annotation_evaluation.py
@@ -0,0 +1,47 @@
+import os
+
+import pandas as pd
+
+
+import utils
+
+pm_sent_no = 'pm_sent_number'
+pm_sent = 'Sätze der Pressemitteilung'
+judgement_sent_no = 'judgement_sent_number'
+judgement_sent = 'Dazu passende Sätze des Urteils'
+keywords = 'Schlagworte'
+comments = 'Anmerkung'
+duration = 'Wie lange hast Du für die Bearbeitung dieses Urteils gebraucht?'
+bad_pm = 'Ist diese Pressemitteilung eine schlechte Darstellung / Zusammenfassung des Urteils?'
+current_dir = 'pm_summary/'
+
+
+def prepare_file(path):
+    """
+    Reads a file and converts it into a uniform format.
+
+    :param path: path to the file.
+    :return: dictionary holding the results. For every sentence of the press release there is a sub-dict (number as key).
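+    Illustrative shape of the result (hypothetical sentence texts):
+    {1: {'pm_sent_number': 1, 'Sätze der Pressemitteilung': 'Erster Satz der PM.'},
+     2: {'pm_sent_number': 2, 'Sätze der Pressemitteilung': 'Zweiter Satz der PM.'}}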
+    """
+    res = {}
+    raw_data = pd.read_excel(path, names=[pm_sent_no, pm_sent], header=None)
+    for index, row in raw_data.iterrows():
+        current_sentence = {pm_sent_no: row[pm_sent_no], pm_sent: row[pm_sent]}
+        res[current_sentence[pm_sent_no]] = current_sentence
+    return res
+
+
+def get_all_pm_files():
+    """
+    Returns the list of all annotated pm-files
+
+    :return: [(pm_filename, file_data)*]
+    """
+    file_path_base = utils.server_path(current_path=current_dir,
+                                       path='../rouge_evalauation/evaluated_data/extractive_judgments')
+    res = []
+    for judgment in os.listdir(file_path_base):
+        if '.xlsx' in judgment:
+            filename = file_path_base + '/' + judgment
+            res.append((judgment, prepare_file(filename)))
+    return res
diff --git a/rouge.py b/rouge.py
new file mode 100644
index 0000000..42f924f
--- /dev/null
+++ b/rouge.py
@@ -0,0 +1,148 @@
+import utils
+
+beta = 1
+# https://aclanthology.org/W04-1013/
+
+
+def rouge_n(reference_summary, created_summary, n, pp_options=None, extended_results=False):
+    """
+    Calculates the ROUGE-N score
+
+    :param reference_summary: gold standard summary
+    :param created_summary: summary to evaluate
+    :param n: size of n-grams
+    :param pp_options: list of options for preprocessing, if None then no preprocessing will be done
+    :param extended_results: indicates whether precision, recall and f-measure should be returned
+    :return: the score, or (precision, recall, f-measure) if extended results are wanted
+    """
+    # preprocess
+    if pp_options is not None:  # otherwise don't preprocess. Text is already preprocessed
+        reference_summary = utils.preprocess_text(reference_summary, pp_options)
+        created_summary = utils.preprocess_text(created_summary, pp_options)
+    else:  # separate sentence marks from tokens
+        for sentence_mark in utils.sentence_marks:
+            reference_summary = reference_summary.replace(sentence_mark, ' ' + sentence_mark)
+            created_summary = created_summary.replace(sentence_mark, ' ' + sentence_mark)
+    # split into n-grams of size n
+    # count occurrences of single n-grams
+    reference_ngrams, ref_complete_count = count_n_grams(reference_summary, n)
+    created_ngrams, created_complete_count = count_n_grams(created_summary, n)
+
+    overlapping_count = 0
+    for ref_key in reference_ngrams.keys():
+        created_count = created_ngrams.get(ref_key)
+        if created_count is not None:  # ngrams in both dicts
+            ref_count = reference_ngrams[ref_key]
+            overlapping_count += min(ref_count, created_count)
+
+    # calculate score
+    if ref_complete_count == 0:
+        return 0
+    recall = overlapping_count / ref_complete_count
+    if extended_results:
+        if created_complete_count == 0 or overlapping_count == 0:  # guard against division by zero
+            return 0, recall, 0
+        precision = overlapping_count / created_complete_count
+        return precision, recall, (2 * precision * recall) / (precision + recall)
+    return recall
+
+
+def count_n_grams(pp_summary, n):
+    """
+    Counts the n-grams of the given size in a summary.
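+    For example (illustrative): count_n_grams('a b a b', 2) returns ({'a b': 2, 'b a': 1}, 3).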
+ + :param pp_summary: Pre-processed summary + :param n: n for the size of ngrams + :return: {ngram:count} for all ngrams in the summary + """ + words = pp_summary.split(' ') + complete_count = 0 + n_grams = {} + for i in range(len(words)-(n-1)): + n_gram = ' '.join(words[i:i+n]) + if n_gram != '': + complete_count += 1 + count = n_grams.get(n_gram) + if count is None: + count = 0 + n_grams[n_gram] = count + 1 + return n_grams, complete_count + + +def rouge_l(reference_summary, created_summary, pp_options=None, extended_results=False): + """ + Calculates the rouge-l value of a summary and its gold standard summary + + :param reference_summary: Gold standard summary + :param created_summary: Created summary to compare + :param pp_options: options for preprocessing, if None then there will be no preprocessing + :param extended_results: if True, precision, recall and f-score will be returned + :return: The calculated score, if extended results are wanted (precision, recall, f-measure) + """ + # preprocess + if pp_options is not None: # otherwise don't preprocess. Text is already preprocessed + reference_summary = utils.preprocess_text(reference_summary, pp_options) + created_summary = utils.preprocess_text(created_summary, pp_options) + # seperate sentence marks from words + # split into sentences + m_reference_word_number = len(reference_summary.split(' ')) + reference_summary = utils.split_into_sentences(reference_summary) + n_created_word_number = len(created_summary.split(' ')) + created_summary = utils.split_into_sentences(created_summary) + total_sum_subsequences = 0 + # to make sure every word in the created summary is used only once + used_created_indices = [set()]*len(created_summary) + used_gold_indices = [set()]*len(reference_summary) + for j in range(len(reference_summary)): + ref_sentence = reference_summary[j] + # calculate union longest subsequence + for i in range(len(created_summary)): + created_sentence = created_summary[i] + indices_a, indices_b = get_subsequence(ref_sentence, created_sentence) + used_gold_indices[j] = (used_gold_indices[j]).union(indices_a) + used_created_indices[i] = (used_created_indices[i]).union(indices_b) + # used indices of b here to ensure words arent used twice + used_created_indices = [len(sent_set) for sent_set in used_created_indices] + used_gold_indices = [len(sent_set) for sent_set in used_gold_indices] + total_sum_subsequences = min(sum(used_gold_indices), sum(used_created_indices)) + + if total_sum_subsequences == 0: + return 0 + p_lcs = total_sum_subsequences / n_created_word_number + r_lcs = total_sum_subsequences / m_reference_word_number + f_lcs = ((1 + beta * beta) * r_lcs*p_lcs) / (r_lcs + beta * beta * p_lcs) + if extended_results: + return p_lcs, r_lcs, f_lcs + return f_lcs + + +def get_subsequence(sent_a, sent_b): + """ + Finds all (not necessarily consecutive) subsequences of a in b. 
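+    Only the longest subsequence found by the greedy scan is returned. Example (illustrative): for
+    sent_a='der Kläger obsiegt' and sent_b='der beklagte Kläger obsiegt' the result is ({0, 1, 2}, {0, 2, 3}).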
+ :param sent_a: Sentence to find subsequences from + :param sent_b: Sentence to find subsequence in + :return: (ind_a, ind_b) two sets of indices of sent_a and sent_b of the longest subsequence + """ + result_a = set() + words_a = sent_a.split(' ') + words_b = sent_b.split(' ') + for word_index_a in range(len(words_a)): + word_result = set() + char_index_b = 0 + while word_index_a < len(words_a): + # word is contained + try: + found_index = words_b.index(words_a[word_index_a], char_index_b) + word_result.add(word_index_a) + char_index_b = found_index + word_index_a += 1 + except ValueError: + # word not in b contained, do nth + word_index_a += 1 + if len(word_result) > len(result_a): + result_a = word_result + result_b = set([words_b.index(words_a[a_ind]) for a_ind in result_a]) + return result_a, result_b + + +if __name__ == "__main__": + + print('Done') diff --git a/rouge_evalauation/create_evaluation_files.py b/rouge_evalauation/create_evaluation_files.py new file mode 100644 index 0000000..c059f84 --- /dev/null +++ b/rouge_evalauation/create_evaluation_files.py @@ -0,0 +1,1439 @@ +import math + +import numpy as np +import pandas as pd +import xlsxwriter as xlsxwriter +import seaborn as sns +import matplotlib.pyplot as plt +from nlp_rake import Rake + +import data.download_rii +import pm_summary.annotation_evaluation +import rouge +import settings +import utils +from pm_summary.annotation_evaluation import pm_sent + +current_dir = 'rouge_evalauation/' +eval_path = 'manual_evaluation/' +eval_run_two_path = 'second_eval/' +evaluated_path = 'evaluated_data/' +picture_path = 'figures/' +summary_sentences_path = 'summary_sentences/' +data_path = 'dataframes/' +all_summaries_df_name = 'all_summaries.json' +sum_sentences_df_name = 'summary_sentences.json' +issues_str = 'legal_issue' +abstr_complete_str = 'abstr_complete' +content_complete_str = 'content_complete' +type_str = 'ls_pm' +sent_no_str = 'sentence_number' +original_str = 'original' +interval_start_str = 'interval_start' +sentence_str = 'sentence' +evaluation_str = 'eval' +summary_id_str = 'summary_id:' +summary_str = 'summary' +rouge_r_str = 'rouge_recall' +rouge_p_str = 'rouge_precision' +rouge_f_str = 'rouge_f_measure' +rouge1_str = 'rouge1' +rougel_str = 'rougel' +rouge_type_str = rouge1_str + ' or ' + rougel_str +pp_options = [utils.pp_option_stopwords, utils.pp_option_lemmatize] +cases_one_list = ['I_ZR_23-18', 'I_ZR_139-15', 'III_ZR_35-18', 'III_ZR_42-19', + 'III_ZR_55-19', 'III_ZR_67-18', 'III_ZR_113-18', 'III_ZR_292-17', + 'III_ZR_391-17', 'V_ZR_112-18', 'V_ZR_176-17', 'V_ZR_218-18', + 'V_ZR_254-17', 'V_ZR_273-17', 'VI_ZR_506-17', 'VII_ZR_151-18', + 'VIII_ZR_94-17', 'VIII_ZR_277-16', 'X_ZR_96-17', 'XII_ZR_13-19'] +cases_two_list = ['I_ZR_146-20', 'I_ZR_153-17', 'II_ZR_84-20', 'II_ZR_152-20', + 'III_ZR_25-20', 'III_ZR_79-21', 'IV_ZR_144-21', 'IV_ZR_253-20', + 'V_ZR_8-19', 'V_ZR_299-19', 'VI_ZR_128-20', 'VI_ZR_252-19', + 'VIa_ZR_418-21', 'VII_ZR_78-20', 'VII_ZR_192-20', 'VIII_ZR_21-19', + 'VIII_ZR_66-17', 'X_ZR_107-16', 'XI_ZR_7-19', 'XI_ZR_345-18'] + + +def select_greedy_summary(split_text, gold_summary, interval_aim, eval_func): + """ + Selects a summary from a text in a greedy fashion. + + :param split_text: List of sentences, Text to choose sentences from, already split into sentences! 
+    :param gold_summary: ideal summary
+    :param interval_aim: (start, end) of interval for the intended final rouge score
+    :param eval_func: rouge score to evaluate the summary, as a function with the arguments (created, gold)
+    :return: [(rouge, sentence)*] with sentence the sentence added in that step and rouge the score reached after
+        adding it, one entry for every added sentence
+    """
+    start_aim, end_aim = interval_aim
+    result = []
+    result_summary = ''
+    current_split_text = [sent for sent in split_text]
+    max_rouge = eval_func(result_summary, gold_summary)
+    while max_rouge <= start_aim and len(current_split_text) > 0:
+        new_sent = ''
+        for sent in list(current_split_text):  # iterate over a copy, elements may be removed below
+            var_result_summary = result_summary + ' ' + sent
+            var_result_summary = var_result_summary.strip()
+            new_rouge = eval_func(var_result_summary, gold_summary)
+            if new_rouge > end_aim:
+                current_split_text.remove(sent)
+            elif max_rouge < new_rouge:
+                max_rouge = new_rouge
+                new_sent = sent
+        if new_sent != '':
+            result_summary += ' ' + new_sent
+            result_summary = result_summary.strip()
+            current_split_text.remove(new_sent)
+            result.append((max_rouge, new_sent))
+        else:
+            break
+
+    return result
+
+
+def get_evaluation_data(case_list):
+    """
+    Loads the judgement data and the press release evaluation files, combines and returns them.
+    Removes press releases without Leitsatz or judgement data.
+
+    :param case_list: list of cases for the run
+    :return: [(pm, judg_row_data(aktenzeichen, leitsatz, tenor, entsch_gr))]
+    """
+    ls_data = data.download_rii.get_selected_bgh_data(directory='..//data//')
+    pm_data_list = pm_summary.annotation_evaluation.get_all_pm_files()
+    pm_prepared_list = []
+    for pm_filename, pm_file_data in pm_data_list:
+        if not any(case in pm_filename for case in case_list):
+            continue
+        aktenzeichen = pm_filename.replace('.xlsx', '').replace('_', ' ').replace('-', '/')
+        original_pm = ''
+        for i in range(len(pm_file_data)):
+            sent_dict = pm_file_data[i + 1]
+            original_pm += ' ' + str(sent_dict[pm_sent])
+        pm_prepared_list.append((aktenzeichen, original_pm))
+
+    pm_prepared_list = [(pm,
+                         ls_data[ls_data[utils.aktenzeichen_str] == aktenzeichen]
+                         [[utils.aktenzeichen_str, utils.leitsatz_str, utils.tenor_str,
+                           utils.entscheidungsgruende_str]].squeeze())
+                        for (aktenzeichen, pm) in pm_prepared_list]
+    # remove pms without ls
+    pm_prepared_list = [(pm, row) for (pm, row)
+                        in pm_prepared_list if row[utils.leitsatz_str] is not None]
+
+    return pm_prepared_list
+
+
+def elaborated_sentence_splitting(text_to_split):
+    """
+    Sentence splitting for entscheidungsgruende, with readjusting of the splitting if something went wrong
+
+    :param text_to_split: Raw text
+    :return: split text as list
+    """
+    res = []
+    for sentence in utils.split_into_sentences(text_to_split):
+        first, rest = utils.split_leading_listing(sentence)
+        if first is not None:
+            res.append(first)
+        res.append(rest)
+    res = rejoin_wrong_splitting(res)
+    res = readjust_splitting(res)
+    return res
+
+
+def prepare_sentences(row_data):
+    """
+    Prepares leitsatz, entscheidungsgruende and tenor. Splits them into sentences, removes listings, etc.
+
+    :param row_data: series containing the data
+    :return: l_fin, l_list, eg_list, combined_list with l_fin the leitsatz as a string, l_list leitsatz as list of
+        strings, eg_list the list of entscheidungsgruende sentences and combined_list the list of sentences of
+        entscheidungsgruende and tenor
+    """
+    l_list = utils.prepare_leitsatz(row_data[utils.leitsatz_str])
+    l_fin = ' '.join(l_list)
+
+    eg_list = elaborated_sentence_splitting(row_data[utils.entscheidungsgruende_str])
+    # select entscheidungsgruende II as split sentences
+    eg_list = utils.select_list_subset(eg_list, utils.entsch_gr_start_sentences)
+    eg_list = [sent for sent in eg_list if len(sent.split()) > 1]
+
+    combined_list = eg_list + elaborated_sentence_splitting(row_data[utils.tenor_str])
+    return l_fin, l_list, eg_list, combined_list
+
+
+def rejoin_wrong_splitting(sent_list):
+    """
+    Some sentences are split wrongly. They are reconnected here.
+
+    :param sent_list: list to check
+    :return: updated list
+    """
+    res = []
+    combined = ''
+    for string in sent_list:
+        combined += ' ' + string
+        combined = combined.strip()
+        if not string.endswith('für die Bemessung des Nutzungsvorteils:') and \
+                not string.endswith('GB, Stand:') and not string.endswith('Probefahrt:') \
+                and not string.strip() == '§ 89 Abs. 1 II.' and not string.endswith('Medizinprodukte, A. VI.')\
+                and not combined.endswith('te, A. VI. 2.') \
+                and not string.endswith('Gemeinschaft:') and not string.endswith('InfoV.'):  # no special cases
+            res.append(combined)
+            combined = ''
+    return res
+
+
+def update_summaries_if_needed(interval_id, existings_summaries, possible_sentences, gold_sum, rouge_1, max_intervals,
+                               max_interval_index):
+    """
+    Tries to create a summary for the given interval. By keeping track of all rouge values along the way, summaries
+    of lower rouge may also be found. This information is then updated in max_intervals and existings_summaries
+
+    :param interval_id: interval_id for the interval to check. Interval start = (interval_id - 1) / 10
+    :param existings_summaries: already existing summaries to intervals for this task
+    :param possible_sentences: possible sentences to choose
+    :param gold_sum: gold summary to compare
+    :param rouge_1: True if ROUGE-1 should be calculated, False otherwise
+    :param max_intervals: list for keeping track of the maximum reached intervals. By starting with the highest
+        possible rouge interval, unreachable rouge values can be detected and skipped.
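+        A value of -1 means that no summary has been created for this task yet, so every interval is still tried.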
+ :param max_interval_index: index in list for current task + :return: existings_summaries, max_intervals with updated values + """ + if rouge_1: + rouge_index = 1 + else: + rouge_index = 2 + + if existings_summaries[interval_id - 1][rouge_index] == '' and \ + (interval_id - 1 < max_intervals[max_interval_index] or max_intervals[max_interval_index] == -1): + + # no summary yet + if rouge_1: + result_list = select_greedy_summary(split_text=possible_sentences, gold_summary=gold_sum, + interval_aim=[(interval_id - 1) / 10, interval_id / 10], + eval_func=lambda created, gold: + rouge.rouge_n(reference_summary=gold, + created_summary=created, + pp_options=pp_options, n=1)) + else: + result_list = select_greedy_summary(split_text=possible_sentences, gold_summary=gold_sum, + interval_aim=[(interval_id - 1) / 10, interval_id / 10], + eval_func=lambda created, gold: + rouge.rouge_l(reference_summary=gold, + created_summary=created, + pp_options=pp_options)) + + summary = '' + for (rouge_v, sentence) in result_list: + summary += ' ' + sentence + summary = summary.strip() + index = math.floor(rouge_v * 10) + if existings_summaries[index][rouge_index] == '': # summary found + if rouge_1: + existings_summaries[index] = (existings_summaries[index][0], summary, existings_summaries[index][2]) + else: + existings_summaries[index] = \ + (existings_summaries[index][0], existings_summaries[index][1], summary) + if index > max_intervals[max_interval_index]: + max_intervals[max_interval_index] = index + return existings_summaries, max_intervals + + +def preselect_sentences(sentence_list, gold_sum): + """ + Makes a preselection of sentences. Removes sentences without two keywords or with a given phrase and combines + konjunktiv until next indicative. + + :param sentence_list: list tu preselect from + :param gold_sum: gold summary to create keywords from + :return: resulting sentences + """ + # combine konjunktiv + res = combine_modus(sentence_list) + # keywords + rake = Rake( + min_chars=1, + max_words=4, + language_code='de', + stopwords=settings.nlp.Defaults.stop_words, + ) + keywords = rake.apply(gold_sum) + wordlist = set() + for keywordstring, _ in keywords: + for token in settings.nlp(keywordstring): + wordlist.add(token.lemma_) + res_var = [] + for sent in res: + keyword_counts = [1 for word in sent.split(' ') if (len(settings.nlp(word)) > 0) + and settings.nlp(word)[0].lemma_ in wordlist] + if sum(keyword_counts) >= 2: + res_var.append(sent) + res = res_var + + # remove sentences with bad phrases + phrases_list = ['Es kann dahinstehen', 'Es kann dahingestellt bleiben', + 'Dabei kann dahingestellt bleiben ', + 'Es kann offenbleiben, dass', 'Es kann offenbleiben, ob', + 'Es bedarf keiner Entscheidung, ob', + 'Das Berufungsgericht hat zu hohe Anforderungen gestellt,', + 'Entgegen der Auffassung der Revision', + 'Entgegen der Auffassung des Berufungsgerichts', + 'Jedenfalls greift die Argumentation des Berufungsgerichts nicht', + 'Jedenfalls greift die Argumentation des Berufungsgerichts zu kurz', 'Selbst wenn'] + res = [sentence for sentence in res if not sentence.startswith(tuple(phrases_list))] + return res + + +def write_files_for_one_judgement(case): + """ + Writes the three files for one judgement. 
+    Goes through all intervals from 0.0-0.1 to 0.9-1.0,
+    creates summaries and writes the results to the files
+
+    :param case: (pm, row, improved), with pm and row as resulting from get_evaluation_data; improved indicates
+        whether the improved version should be used
+    """
+    pm, row, improved = case
+    l_fin, l_list, eg_list, combined_list = prepare_sentences(row)
+    if improved:
+        eg_list = preselect_sentences(eg_list, l_fin)
+        combined_list = preselect_sentences(combined_list, pm)
+    max_intervals = [-1] * 4
+    ls_sums = [(str((i - 1) / 10) + '-' + str(i / 10), '', '') for i in range(1, 12, 1)]
+    pm_sums = [(str((i - 1) / 10) + '-' + str(i / 10), '', '') for i in range(1, 12, 1)]
+    for i in range(11, 1, -1):
+        ls_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=l_fin, existings_summaries=ls_sums,
+                                                            max_interval_index=0, max_intervals=max_intervals,
+                                                            possible_sentences=eg_list, rouge_1=True)
+
+        ls_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=l_fin, existings_summaries=ls_sums,
+                                                            max_interval_index=1, max_intervals=max_intervals,
+                                                            possible_sentences=eg_list, rouge_1=False)
+
+        pm_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=pm, existings_summaries=pm_sums,
+                                                            max_interval_index=2, max_intervals=max_intervals,
+                                                            possible_sentences=combined_list, rouge_1=True)
+
+        pm_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=pm, existings_summaries=pm_sums,
+                                                            max_interval_index=3, max_intervals=max_intervals,
+                                                            possible_sentences=combined_list, rouge_1=False)
+
+    # ROUGE Overviews
+    write_rouge_overview((l_fin, ls_sums, eg_list), (pm, pm_sums, combined_list), row[utils.aktenzeichen_str],
+                         improved=improved)
+    # Evaluation Files
+    write_evaluation_files((l_fin, ls_sums), (pm, pm_sums), row[utils.aktenzeichen_str], (eg_list, combined_list),
+                           improved)
+
+
+def combine_consecutive_sentences(sentences_to_combine, original_list):
+    """
+    In case sentences in the first list are consecutive sentences in the second list, they are combined into one string
+
+    :param sentences_to_combine: strings here might be combined
+    :param original_list: original list for getting the order
+    :return: updated list
+    """
+    indices = sorted([(get_index_in_list(sent, original_list), sent) for sent in sentences_to_combine])
+    res = []
+    old_index = -1
+    current_package = ''
+    for index, sent in indices:
+        if index - old_index == 1:  # consecutive sentences
+            current_package += ' ' + sent
+            current_package = current_package.strip()
+        else:  # old package is done
+            if current_package != '':
+                res.append(current_package)
+            current_package = sent
+        old_index = index
+    if current_package != '':
+        res.append(current_package)
+    return res
+
+
+def write_evaluation_files(ls_data, pm_data, aktenzeichen, sent_lists, improved):
+    """
+    Writes the excel files for legal evaluation.
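+    One workbook per case is written, with one sheet for the leitsatz sentences and one for the press release
+    sentences; the sentences are sorted alphabetically so that their order gives no hint of any ranking.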
+
+    :param improved: if True, the improved version is used
+    :param ls_data: ls, ls_sums with ls the leitsatz and ls_sums the created summaries and intervals
+    :param pm_data: pm, pm_sums with pm the press release and pm_sums the created summaries and intervals
+    :param aktenzeichen: aktenzeichen of the case
+    :param sent_lists: (eg_list, combined_list) with the original sentences for finding consecutive sentences
+    """
+    eg_list, combined_list = sent_lists
+    ls_sentences = []
+    pm_sentences = []
+    ls, ls_sums = ls_data
+    pm, pm_sums = pm_data
+    for _, r1, rl in ls_sums:
+        r1_sents = elaborated_sentence_splitting(r1)
+        if improved:
+            r1_sents = combine_modus(r1_sents)
+            r1_sents = combine_consecutive_sentences(r1_sents, eg_list)
+        r1_sents = [sent for sent in r1_sents if sent not in ls_sentences]
+        ls_sentences += r1_sents
+        rl_sents = elaborated_sentence_splitting(rl)
+        if improved:
+            rl_sents = combine_modus(rl_sents)
+            rl_sents = combine_consecutive_sentences(rl_sents, eg_list)
+        rl_sents = [sent for sent in rl_sents if sent not in ls_sentences]
+        ls_sentences += rl_sents
+    for _, r1, rl in pm_sums:
+        r1_sents = elaborated_sentence_splitting(r1)
+        if improved:
+            r1_sents = combine_modus(r1_sents)
+            r1_sents = combine_consecutive_sentences(r1_sents, combined_list)
+        r1_sents = [sent for sent in r1_sents if sent not in pm_sentences]
+        pm_sentences += r1_sents
+        rl_sents = elaborated_sentence_splitting(rl)
+        if improved:
+            rl_sents = combine_modus(rl_sents)
+            rl_sents = combine_consecutive_sentences(rl_sents, combined_list)
+        rl_sents = [sent for sent in rl_sents if sent not in pm_sentences]
+        pm_sentences += rl_sents
+
+    if improved:
+        savepath = eval_path + eval_run_two_path + 'sentences/'
+    else:
+        savepath = eval_path + 'sentences/'
+    utils.create_dir(current_path=current_dir, directory_name=savepath, delete=False)
+    workbook = xlsxwriter.Workbook(
+        utils.server_path(current_path=current_dir,
+                          path=savepath + aktenzeichen.replace('/', '-') + '.xlsx'))
+
+    # sorting so the order gives no indication of any ranking
+    ls_sentences = sorted(ls_sentences)
+    pm_sentences = sorted(pm_sentences)
+    write_one_evaluation_worksheet(workbook, 'Leitsatz', ls_sentences)
+    write_one_evaluation_worksheet(workbook, 'Pressemitteilung', pm_sentences)
+    workbook.close()
+
+
+def write_one_evaluation_worksheet(workbook, worksheetname, sentences):
+    """
+    Writes one excel sheet, either for the press release or the leitsatz
+
+    :param workbook: excel workbook to write in
+    :param worksheetname: name of the sheet
+    :param sentences: the sentences to write
+    """
+    worksheet = workbook.add_worksheet(name=worksheetname)
+    cell_format = workbook.add_format()
+    cell_format.set_text_wrap()
+    worksheet.set_column(2, 20, 20)
+    worksheet.set_column(1, 1, 55)
+
+    # description line
+    worksheet.write(1, 0, 'Nummer')
+    worksheet.write(1, 1, 'Satz')
+    for i in range(0, 10, 2):
+        worksheet.write(0, 2 + i, 'rechtliche Aussage:')
+        # split line
+        worksheet.write(1, 2 + i, 'Kategorie')
+        worksheet.write(1, 2 + i + 1, 'Dopplung')
+    # sentences with numbers
+    for i in range(len(sentences)):
+        worksheet.write(2 + i, 0, i + 1)
+        worksheet.write(2 + i, 1, sentences[i], cell_format)
+    # ending line
+    for i in range(0, 10, 2):
+        worksheet.write(2 + len(sentences), 2 + i,
+                        'Falls der Inhalt der rechtlichen Aussage vollständig abgebildet wurde, welche Sätze '
+                        'werden dazu benötigt?', cell_format)
+        worksheet.write(2 + len(sentences) + 1, 2 + i,
+                        'Falls der Inhalt der rechtlichen Aussage insgesamt in einem passenden Abstraktionsniveau '
+                        'angegeben wurden, welche Sätze werden dazu benötigt?', cell_format)
+
+
+def write_one_overview_worksheet(workbook, worksheet_name, sum_data):
+    """
+    Writes one worksheet for the rouge overview files.
+
+    :param workbook: workbook to write in
+    :param worksheet_name: name of the sheet
+    :param sum_data: (gold, created, original_list, improved) summaries
+    """
+    worksheet = workbook.add_worksheet(name=worksheet_name)
+    gold, created, original_list, improved = sum_data
+    # original text
+    worksheet.write(0, 0, gold)
+    row = 2
+    current = ''
+    sentences = utils.split_into_sentences(created)
+    if improved:
+        sentences = combine_consecutive_sentences(sentences_to_combine=sentences, original_list=original_list)
+    for i in range(len(sentences)):
+        # sentence
+        current += ' ' + sentences[i]
+        current = current.strip()
+        worksheet.write(row, 0, sentences[i])
+        row += 1
+
+
+def write_rouge_overview(ls_data, pm_data, aktenzeichen, improved):
+    """
+    Writes the rouge overview files
+
+    :param ls_data: ls, ls_sums, eg_list with ls the leitsatz and ls_sums the created summaries
+    :param pm_data: pm, pm_sums, comb_list with pm the press release and pm_sums the created summaries
+    :param aktenzeichen: aktenzeichen of the case
+    :param improved: True if the improved version should be run
+    """
+    if improved:
+        savepath = eval_path + eval_run_two_path + 'rouge_overview/'
+    else:
+        savepath = eval_path + 'rouge_overview/'
+    utils.create_dir(current_path=current_dir, directory_name=savepath, delete=False)
+    workbook = xlsxwriter.Workbook(
+        utils.server_path(current_path=current_dir,
+                          path=savepath + aktenzeichen.replace('/', '-') + '.xlsx'))
+    ls, ls_sums, eg_list = ls_data
+    for interval, sum_r1, sum_rl in ls_sums:
+        write_one_overview_worksheet(workbook, 'ls rouge1 ' + interval, sum_data=(ls, sum_r1, eg_list, improved))
+        write_one_overview_worksheet(workbook, 'ls rougel ' + interval, sum_data=(ls, sum_rl, eg_list, improved))
+    pm, pm_sums, comb_list = pm_data
+    for interval, sum_r1, sum_rl in pm_sums:
+        write_one_overview_worksheet(workbook, 'pm rouge1 ' + interval, sum_data=(pm, sum_r1, comb_list, improved))
+        write_one_overview_worksheet(workbook, 'pm rougel ' + interval, sum_data=(pm, sum_rl, comb_list, improved))
+    workbook.close()
+
+
+def read_or_load_summaries():
+    """
+    Reads or loads the summaries from their files, for every interval, for ROUGE-1 and ROUGE-L, and for pm and ls.
+    Loads if the cached dataframes exist, reads otherwise.
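+    The combined dataframes are cached as JSON under dataframes/ (all_summaries.json and summary_sentences.json);
+    deleting those files forces a rebuild from the rouge_overview excel files.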
+ + :return: (summaries, sentences) dataframes with summaries the summary data, sentences the corresponding + sentences and counts the counts of existing summaries for each combination + """ + try: + summaries = utils.df_from_json(current_path=current_dir, path=data_path + all_summaries_df_name) + sentences = utils.df_from_json(current_path=current_dir, path=data_path + sum_sentences_df_name) + except Exception: + sentences = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str]) + summaries = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, original_str, summary_id_str, summary_str, + rouge_p_str, rouge_r_str, rouge_f_str, rouge_type_str]) + rouge_overview_path = eval_path + 'rouge_overview/' + + for file in utils.list_dir_files(current_path=current_dir, path=rouge_overview_path): + current_summary_id = 0 + file_sentences = pd.DataFrame( + columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str]) + + aktenzeichen = file.replace('.xlsx', '') + # intervalle durchgehen + for i in range(1, 11, 1): + # pm und leitsatz durchgehen + for identifier in ['pm', 'ls']: + # rouge 1+ l durchgehen + for rouge_metric in [rouge1_str, rougel_str]: + sheetname = identifier + ' ' + rouge_metric + ' ' + \ + str((i - 1) / 10) + '-' + str(i / 10) + df_sheet_data = pd.read_excel(rouge_overview_path + '/' + file, + sheet_name=sheetname) + if df_sheet_data.shape[0] == 0: + continue + my_summary = '' + my_sentences = [False] * file_sentences.shape[0] + original_summary = df_sheet_data.columns.values[0] + for index, row in df_sheet_data.iterrows(): + if index == 0: # first row is empty + continue + sent = row[original_summary] + my_summary += ' ' + sent + my_summary = my_summary.strip() + # add sent to sentences or mark the old index + existing_sent = file_sentences.loc[(file_sentences[utils.aktenzeichen_str] == + aktenzeichen) & + (file_sentences[sentence_str] == sent) & + (file_sentences[type_str] == identifier) & + (file_sentences[ + rouge_type_str] == rouge_metric)] + if existing_sent.shape[0] > 0: + my_sentences[existing_sent.index.values[0]] = True + else: + file_sentences.loc[len(file_sentences.index)] = [identifier, aktenzeichen, + rouge_metric, sent] + \ + [False] * ( + file_sentences.shape[ + 1] - 4) + my_sentences.append(True) + + file_sentences[current_summary_id] = my_sentences + file_sentences = file_sentences.T.drop_duplicates().T + if file_sentences.shape[1] <= (4 + current_summary_id): # duplicate summary + continue + if rouge_metric == rouge1_str: + r_p, r_r, r_f = rouge.rouge_n(reference_summary=original_summary, + created_summary=my_summary, + pp_options=pp_options, n=1, extended_results=True) + else: + r_p, r_r, r_f = rouge.rouge_l(reference_summary=original_summary, + created_summary=my_summary, + pp_options=pp_options, extended_results=True) + summaries.loc[len(summaries.index)] = [identifier, aktenzeichen, original_summary, + current_summary_id, my_summary, r_p, r_r, + r_f, + rouge_metric] + current_summary_id += 1 + + sentences = pd.concat([sentences, file_sentences], ignore_index=True) + + sentences = sentences.fillna(False) + utils.create_dir(current_path=current_dir, directory_name=data_path, delete=False) + utils.df_to_json(current_path=current_dir, path=data_path + all_summaries_df_name, dataframe=summaries) + utils.df_to_json(current_path=current_dir, path=data_path + sum_sentences_df_name, dataframe=sentences) + + return summaries, sentences + + +def read_or_load_summaries_run_two(): + """ + Reads or loads the the summaries from 
their files. For every intervalm rouge-l and rouge-1 and pm and ls. + Loads if exists, reads otherwise. + + :return: (summaries, sentences) dataframes with summaries the summary data, sentences the corresponding + sentences and counts the counts of existing summaries for each combination + """ + try: + summaries = utils.df_from_json(current_path=current_dir, + path=data_path + eval_run_two_path + all_summaries_df_name) + sentences = utils.df_from_json(current_path=current_dir, + path=data_path + eval_run_two_path + sum_sentences_df_name) + except Exception: + sentences = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str]) + summaries = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, original_str, summary_id_str, summary_str, + rouge_p_str, rouge_r_str, rouge_f_str, rouge_type_str]) + rouge_overview_path = eval_path + eval_run_two_path + 'rouge_overview/' + + for file in utils.list_dir_files(current_path=current_dir, path=rouge_overview_path): + current_summary_id = 0 + file_sentences = pd.DataFrame( + columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str]) + + aktenzeichen = file.replace('.xlsx', '') + # intervalle durchgehen + for i in range(1, 11, 1): + # pm und leitsatz durchgehen + for identifier in ['pm', 'ls']: + # rouge 1+ l durchgehen + for rouge_metric in [rouge1_str, rougel_str]: + sheetname = identifier + ' ' + rouge_metric + ' ' + \ + str((i - 1) / 10) + '-' + str(i / 10) + df_sheet_data = pd.read_excel(rouge_overview_path + '/' + file, + sheet_name=sheetname) + if df_sheet_data.shape[0] == 0: + continue + my_summary = '' + my_sentences = [False] * file_sentences.shape[0] + original_summary = df_sheet_data.columns.values[0] + for index, row in df_sheet_data.iterrows(): + if index == 0: # first row is empty + continue + sent = row[original_summary] + my_summary += ' ' + sent + my_summary = my_summary.strip() + # add sent to sentences or mark the old index + existing_sent = file_sentences.loc[(file_sentences[utils.aktenzeichen_str] == + aktenzeichen) & + (file_sentences[sentence_str] == sent) & + (file_sentences[type_str] == identifier) & + (file_sentences[ + rouge_type_str] == rouge_metric)] + if existing_sent.shape[0] > 0: + my_sentences[existing_sent.index.values[0]] = True + else: + file_sentences.loc[len(file_sentences.index)] = [identifier, aktenzeichen, + rouge_metric, sent] + \ + [False] * ( + file_sentences.shape[ + 1] - 4) + my_sentences.append(True) + + file_sentences[current_summary_id] = my_sentences + file_sentences = file_sentences.T.drop_duplicates().T + if file_sentences.shape[1] <= (4 + current_summary_id): # duplicate summary + continue + if rouge_metric == rouge1_str: + r_p, r_r, r_f = rouge.rouge_n(reference_summary=original_summary, + created_summary=my_summary, + pp_options=pp_options, n=1, extended_results=True) + else: + r_p, r_r, r_f = rouge.rouge_l(reference_summary=original_summary, + created_summary=my_summary, + pp_options=pp_options, extended_results=True) + summaries.loc[len(summaries.index)] = [identifier, aktenzeichen, original_summary, + current_summary_id, my_summary, r_p, r_r, + r_f, + rouge_metric] + current_summary_id += 1 + + sentences = pd.concat([sentences, file_sentences], ignore_index=True) + + sentences = sentences.fillna(False) + utils.create_dir(current_path=current_dir, directory_name=data_path + eval_run_two_path, delete=False) + utils.df_to_json(current_path=current_dir, path=data_path + eval_run_two_path + all_summaries_df_name, + dataframe=summaries) + 
utils.df_to_json(current_path=current_dir, path=data_path + eval_run_two_path + sum_sentences_df_name, + dataframe=sentences) + + return summaries, sentences + + +def get_evaluated_sentences(run_two=False): + """ + Reads the evaluated sentences + + :return: (sentences, info) two dataframes containing all sentences and additional info about sentences needed for a + complete representation of the original + """ + path = evaluated_path + summary_sentences_path + if run_two: + path = evaluated_path + eval_run_two_path + summary_sentences_path + + result_sentences = pd.DataFrame() + result_info = pd.DataFrame() + + for file in utils.list_dir_files(current_path=current_dir, path=path): + ls_data = pd.read_excel(utils.server_path(current_path=current_dir, path=path + file), + sheet_name='Leitsatz') + pm_data = pd.read_excel(utils.server_path(current_path=current_dir, path=path + file), + sheet_name='Pressemitteilung') + aktenzeichen = file.replace('.xlsx', '') + + sentences, info = extract_one_type_data(ls_data, 'ls', aktenzeichen) + sentences_pm, info_pm = extract_one_type_data(pm_data, 'pm', aktenzeichen) + result_sentences = pd.concat([result_sentences, sentences, sentences_pm]) + result_info = pd.concat([result_info, info, info_pm], ignore_index=True) + + return result_sentences, result_info + + +def extract_one_type_data(dataframe, current_type, aktenzeichen): + """ + Extracts the data of one worksheet + + :param dataframe: raw dataframe of the sheet + :param current_type: ls or pm + :param aktenzeichen: aktenzeichen of the judgement + :return: sentences, info with sentences the evaluated sentences and info a dataframe of legal issues and the + sentences needed to complete them + """ + # check if column empty + data_row_count = dataframe.shape[0] - 3 + + drop_columns = [] + for i in range(len(dataframe.columns)): + column = dataframe.columns.values[i] + if (i % 2) == 0: + if dataframe[column][1:-2].isnull().sum() == data_row_count: + # everything empty + index = dataframe.columns.get_loc(column) + drop_columns.append(index) + if index + 1 < len(dataframe.columns): + drop_columns.append(index + 1) + + dataframe.drop(dataframe.columns[drop_columns], axis=1, inplace=True) # also dop dopplung + + # sentence duplicates + duplicates = [] + duplicate_rows = dataframe.iloc[1:-2, 3::2].dropna(how='all') + for index, row in duplicate_rows.iterrows(): + for dup_index in row.unique(): + if not pd.isna(dup_index): + duplicates.append((index, dup_index)) + + # refill values for duplicates if not given + for a, b in duplicates: + a_v = dataframe.iloc[a].iloc[2::2].dropna().unique() + b_v = dataframe.iloc[b].iloc[2::2].dropna().unique() + if len(b_v) == 0: # missing value + dataframe.iloc[b, 2] = a_v[0] + if len(a_v) == 0: # missing value + dataframe.iloc[a, 2] = b_v[0] + + # Sentence evaluations + sentences = dataframe.iloc[1:-2, :2] + sentences.columns = [sent_no_str, sentence_str] + legal_issues = dataframe.iloc[1:-2, 2:].T.apply(lambda col: col.iloc[::2].dropna()).T + column_names = [issues_str + str(i) for i in range(1, legal_issues.shape[1] + 1)] + legal_issues.columns = column_names + sentences = pd.concat([sentences, legal_issues], axis=1) + # Kombination für vollständige Abbildung finden + abstr_matching = [] + content_matching = [] + subset = dataframe.iloc[-2:, 2:] + for i in range(1, subset.shape[1], 2): + if not pd.isna(subset.iloc[0, i]): + content_matching.append(subset.iloc[0, i]) + else: + content_matching.append('') + if not pd.isna(subset.iloc[1, i]): + 
+            abstr_matching.append(subset.iloc[1, i])
+        else:
+            abstr_matching.append('')
+
+    sentences[type_str] = current_type
+
+    sentences[utils.aktenzeichen_str] = aktenzeichen
+    abstr_matching = insert_duplicates(abstr_matching, duplicates)
+    content_matching = insert_duplicates(content_matching, duplicates)
+    info = pd.DataFrame(columns=[utils.aktenzeichen_str, type_str, issues_str, abstr_complete_str,
+                                 content_complete_str])
+    for i in range(len(abstr_matching)):
+        info.loc[len(info.index)] = [aktenzeichen, current_type, str(i), abstr_matching[i], content_matching[i]]
+
+    return sentences, info
+
+
+def insert_duplicates(match, duplicates):
+    """
+    In the list of sentences needed for a complete content or abstraction level, the duplicates are inserted
+
+    :param match: list of either abstraction or content matchings
+    :param duplicates: [(a,b)*] the list of duplicates
+    :return: the updated input list
+    """
+    for i in range(len(match)):
+        matching = str(match[i]).replace(',', '').split(' ')
+        for a, b in duplicates:
+            if str(a) in matching:
+                matching.remove(str(a))
+                matching.append(str(a) + ',' + str(b))
+            if str(b) in matching:
+                matching.remove(str(b))
+                matching.append(str(a) + ',' + str(b))
+        match[i] = ' '.join(matching)
+    return match
+
+
+def get_interval_counts(summaries):
+    """
+    Counts all summaries for the intervals
+
+    :param summaries: all summaries
+    :return: dataframe with the counted results
+    """
+    res = pd.DataFrame()
+    for type_id in ['ls', 'pm']:
+        for rouge_type in [rouge1_str, rougel_str]:
+            selected_summaries = summaries[(summaries[type_str] == type_id) &
+                                           (summaries[rouge_type_str] == rouge_type)]
+            counts = selected_summaries[interval_start_str].value_counts()
+            counts.name = rouge_type + '_' + type_id
+            res = pd.concat([res, counts], axis=1)
+    res = res.sort_index()
+
+    return res
+
+
+def get_interval_mean_ranking(summaries):
+    """
+    Gets the mean ranking of all summaries for the intervals
+
+    :param summaries: all summaries
+    :return: dataframe with the mean rankings
+    """
+    res = pd.DataFrame()
+    for type_id in ['ls', 'pm']:
+        for rouge_type in [rouge1_str, rougel_str]:
+            selected_summaries = summaries[(summaries[type_str] == type_id) &
+                                           (summaries[rouge_type_str] == rouge_type)]
+            means = selected_summaries.groupby(interval_start_str)[evaluation_str].mean()
+            means.name = rouge_type + '_' + type_id
+            res = pd.concat([res, means], axis=1)
+    res = res.sort_index()
+
+    return res
+
+
+def sentences_complete(number_list, sentence_index_list):
+    """
+    Determines whether all sentences needed for completion are contained.
+
+    :param number_list: abstr_ or content_ list
+    :param sentence_index_list: all sentence indices of the summary
+    :return: True if all sentences are contained, False otherwise
+    """
+    if number_list == '':
+        return False
+    else:
+        numbers = number_list.split(' ')
+        for number in numbers:
+            if ',' in number:  # duplicate numbering, 'or'
+                a, b = number.split(',')
+                if a not in sentence_index_list and b not in sentence_index_list:
+                    return False
+            else:
+                if number not in sentence_index_list:
+                    return False
+    return True
+
+
+def get_cat_values(all_values):
+    """
+    Reads all evaluated categories and writes them to a set
+
+    :param all_values: raw evaluation
+    :return: set of all letters contained
+    """
+    all_cats = set()
+    for cat in all_values:
+        if not pd.isna(cat):
+            for char in cat:
+                all_cats.add(char)
+    return all_cats
+
+
+def get_one_summary_evaluation(package):
+    """
+    Evaluates one summary.
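+    The category scores of all legal issues are accumulated and normalised by the number of issues and by 4 (the
+    per-issue maximum), e.g. (illustrative) a single issue rated 'V' and 'G' yields (2 + 2) / 1 / 4 = 1.0.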
+ + :param package: my_info, my_sum_sents, sum_index + :return: sum_index, evaluation. + """ + my_info, my_sum_sents, sum_index = package + # for each legal issue + result = 0 + num_legal_issues = my_info.shape[0] + for issue in range(num_legal_issues): + content_list = my_info[my_info[issues_str] == str(issue)][content_complete_str].iloc[0] + sentence_list = my_sum_sents[sent_no_str].unique() + sentence_list = [str(v) for v in sentence_list] + cat_v_content_comp = sentences_complete(content_list, sentence_list) + all_categories = get_cat_values(my_sum_sents[issues_str + str(issue + 1)].unique()) + if 'F' in all_categories: + result = -4 + elif 'L' in all_categories: + result -= 2 + + if 'V' in all_categories or cat_v_content_comp: + result += 2 + elif 'P' in all_categories or 'E' in all_categories: + result += 1 + + if 'S' in all_categories: # as soon as there is one S, only one point + result += 1 + elif 'E' in all_categories or 'G' in all_categories: # if no S, but G or E, then 2 points + result += 2 + return sum_index, result / num_legal_issues / 4 # divide with 4 for range + + +def evaluate_all_summaries(info, sents, sums): + """ + Coordinates calculation of all summary evaluations + + :param info: infos to the summaries + :param sents: sentences of all summaries + :param sums: summarie overviews of all summaries + :return: sums with an appended column cointaining the evaluation + """ + packaged_info = [(info[(info[utils.aktenzeichen_str] == row[utils.aktenzeichen_str]) & + (info[type_str] == row[type_str])], + sents[(sents[utils.aktenzeichen_str] == row[utils.aktenzeichen_str]) + & (sents[str(row[summary_id_str])] == True)], index) + for index, row in sums.iterrows()] + res = utils.parallel_imap(get_one_summary_evaluation, packaged_args=packaged_info) + var = [] + for content in res: + index, evaluation = content + var.append((index, evaluation)) + + idx, values = zip(*var) + evaluations = pd.Series(values, idx) + evaluations.name = evaluation_str + res_summaries = pd.concat([sums, evaluations], axis=1) + return res_summaries + + +def get_category_overview_data(sentences, name): + """ + Gets the data for plotting a bar plot of categories + + :param sentences: sentences to include + :param name: name for the column + :return: dataframe containing prepared data + """ + sent_count = sentences.shape[0] + evals = sentences.apply(pd.Series.value_counts).sum(axis=1) + res = {} + for i in evals.index: + for char in i: + if char != ' ': + vals = res.get(char) + if vals is None: + vals = 0 + vals += evals[i] + res[char] = vals + for key in res.keys(): + res[key] = [res[key] / sent_count] + res_df = pd.DataFrame.from_dict(res).T.sort_index() + res_df.columns = [name] + return res_df + + +def get_category_counts(sentences, name, sums, evaluation_cols): + """ + Gets the data for plotting a bar plot of categories + + :param sentences: sentences to include + :param name: name for the column + :param evaluation_cols: number of evaluation columns + :param sums: all summaries + :return: dataframe containing prepared data + """ + summary_count = sums.shape[0] + sentences_all = pd.DataFrame() + for _, row in sentences.iterrows(): + for _ in range(len(elaborated_sentence_splitting(row[sentence_str]))): + occurance_count = row[[str(x) for x in range(0, sums[summary_id_str].max() + 1)]].value_counts()[True] + for _ in range(occurance_count): + sentences_all = pd.concat([sentences_all, row], axis=1) + sentences_all = sentences_all.T[[issues_str + str(i) for i in range(1, evaluation_cols + 1)]] + sent_count 
= sentences_all.shape[0] + evals = sentences_all.apply(pd.Series.value_counts).sum(axis=1) + res = {} + for i in evals.index: + for char in i: + if char != ' ': + vals = res.get(char) + if vals is None: + vals = 0 + vals += evals[i] + res[char] = vals + for key in res.keys(): + res[key] = [res[key] / sent_count] + res['avg number of sentences'] = [sent_count / summary_count] + res_df = pd.DataFrame.from_dict(res).T.sort_index() + res_df.columns = [name] + return res_df + + +def draw_pics(sents, sums, info, run_two=False): + """ + Draws the images for the visual evaluation. + + :param sents: sentences to visualize + :param sums: summaries to visualize + :param info: corresponding info + :param run_two: if True, then files are written to dedicated directory for second run + """ + my_picture_path = picture_path + if run_two: + my_picture_path += eval_run_two_path + utils.create_dir(current_path=current_dir, directory_name=my_picture_path, delete=False) + + # sent_subset = sents[~sents[[issues_str+str(i) for i in range(1, int(info[issues_str].max())+2)]] + # .isin(['R', 'T']).any(axis=1)] + # sum_subset = pd.DataFrame() + # for _, row in sums.iterrows(): + # subset = sent_subset[(sent_subset[type_str] == row[type_str]) & + # (sent_subset[rouge_type_str] == row[rouge_type_str]) & + # (sent_subset[utils.aktenzeichen_str] == row[utils.aktenzeichen_str]) + # & (sent_subset[str(row[summary_id_str])] == True)] + # if subset.shape[0] > 0: + # sum_subset = pd.concat([sum_subset, row], axis=1) + # sum_subset = sum_subset.T + # sents = sent_subset + # sums = sum_subset + + fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(11, 6)) + for sum_type in ['ls', 'pm']: + for r_type in [rouge1_str, rougel_str]: + x_ticks = ['0.1 -\n0.2', '0.2 -\n0.3', '0.3 -\n0.4', '0.4 -\n0.5', '0.5 -\n0.6', '0.6 -\n0.7', + '0.7 -\n0.8', '0.8 -\n0.9', '0.9 -\n1.0'] + if sum_type == 'ls': + if r_type == rouge1_str: + ax = ax1 + x_label = 'ROUGE-1' + y_label = 'Guiding principles' + else: + ax = ax2 + x_label = 'ROUGE-L' + y_label = '' + else: + if r_type == rouge1_str: + ax = ax3 + x_label = '' + y_label = 'Press releases' + else: + ax = ax4 + x_label = '' + y_label = '' + x_ticks = [tick for tick in x_ticks if float(tick[:3]) in sums[(sums[type_str] == sum_type) & + (sums[rouge_type_str] == + r_type)][interval_start_str].unique()] + sums[(sums[type_str] == sum_type) & (sums[rouge_type_str] == r_type)][ + ['eval', interval_start_str]].plot(kind='box', ax=ax, by=interval_start_str, + color=dict(boxes='black', whiskers='black', + medians='black', caps='black'), + ylabel=y_label) + ax.set_title(x_label) + ax.set_ylim(0, 1) + ax.set_xticklabels(x_ticks) + + fig.savefig(my_picture_path + 'boxplots.png') + xticklabels = ['ROUGE-1\ngp', 'ROUGE-L\ngp', 'ROUGE-1\npr', + 'ROUGE-L\npr'] + yticklabels = ['0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7', '0.7-0.8', '0.8-0.9', '0.9-1.0'] + fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(12, 5)) + mean_rankings = get_interval_mean_ranking(sums).fillna(0) + sns.heatmap(mean_rankings, annot=True, ax=ax, cmap='Greys', vmax=1, xticklabels=xticklabels, + yticklabels=yticklabels) + + all_counts = get_interval_counts(sums).fillna(0) + sns.heatmap(all_counts, annot=True, ax=ax2, cmap='Greys_r', xticklabels=xticklabels, yticklabels=yticklabels) + ax2.tick_params(rotation=0) + ax.tick_params(rotation=0) + ax2.set_ylabel('Interval') + ax.set_ylabel('Interval') + fig.savefig(my_picture_path + 'heatmaps.png') + + evaluations_cols = info[issues_str].astype(int).max() + 1 + evals = 
sents[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
+ res_df = get_category_overview_data(evals, 'all')
+ for sum_type in ['ls', 'pm']:
+ for r_type in [rouge1_str, rougel_str]:
+ evals = sents[(sents[rouge_type_str] == r_type) & (sents[type_str] == sum_type)]
+ evals = evals[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
+ res_df = pd.concat([res_df, get_category_overview_data(evals, sum_type + ' ' + r_type)], axis=1)
+ fig = res_df.plot(kind='bar').get_figure()
+ fig.savefig(utils.server_path(current_path=current_dir, path=my_picture_path + 'cat_perc_types.png'))
+
+ evals = sents[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
+ res_df_intervals = get_category_overview_data(evals, 'all')
+ for interval in sums[interval_start_str].unique():
+ selection_list = sums[sums[interval_start_str] == interval][[utils.aktenzeichen_str, summary_id_str]].values
+ evals = pd.DataFrame()
+ for aktenzeichen, sum_id in selection_list:
+ evals = pd.concat([evals, sents[(sents[str(sum_id)] == True) &
+ (sents[utils.aktenzeichen_str] == aktenzeichen)]])
+ evals = evals[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
+ res_df_intervals = pd.concat([res_df_intervals, get_category_overview_data(evals, interval)], axis=1)
+ fig = res_df_intervals.plot(kind='bar').get_figure()
+ fig.savefig(utils.server_path(current_path=current_dir, path=my_picture_path + 'cat_perc_intervals.png'))
+
+ # table overview
+ for sum_type in ['ls', 'pm']:
+ for r_type in [rouge1_str, rougel_str]:
+ print(sum_type + ' ' + r_type)
+ my_sents = sents[(sents[type_str] == sum_type) & (sents[rouge_type_str] == r_type)]
+ my_sums = sums[(sums[type_str] == sum_type) & (sums[rouge_type_str] == r_type)]
+ table_data = pd.DataFrame()
+ for interval in my_sums[interval_start_str].unique():
+ my_current_sums = my_sums[my_sums[interval_start_str] == interval]
+ selection_list = my_current_sums[[utils.aktenzeichen_str, summary_id_str]].values
+ evals = pd.DataFrame()
+ for aktenzeichen, sum_id in selection_list:
+ evals = pd.concat([evals, my_sents[(my_sents[str(sum_id)] == True) &
+ (my_sents[utils.aktenzeichen_str] == aktenzeichen)]])
+ table_data = pd.concat([table_data, get_category_counts(evals, interval, my_current_sums,
+ evaluations_cols).T], axis=0)
+ print_data = pd.DataFrame()
+ print_data['R+T'] = table_data['R'] + table_data['T']
+ print_data['U'] = table_data['U']
+ print_data['Rest'] = 1 - (print_data['U'] + print_data['R+T'])
+ print_data['avg sentences'] = table_data['avg number of sentences']
+ print(print_data.to_string())
+
+
+def readjust_consecutive_sentences(sents, sums, info):
+ """
+ Reads in the additional consecutive sentences from the evaluation file
+
+ :param sents: all existing sentences
+ :param sums: all existing summaries
+ :param info: all existing infos
+ :return: the sentences including the new sentence packages
+ """
+ res = pd.DataFrame()
+ comb_sents = pd.read_excel(evaluated_path + eval_run_two_path + 'AufeinanderfolgendeSätze.xlsx')
+ comb_sents.columns = [utils.aktenzeichen_str, type_str, sentence_str, evaluation_str,
+ evaluation_str + '_new']
+
+ new_sent_indices = {}
+ for _, comb_row in comb_sents.iterrows():
+ sentences = elaborated_sentence_splitting(comb_row[sentence_str])
+ if len(sentences) != len(comb_row[evaluation_str].split()):
+ print('Wrong splitting!')
+ r1_sents = sents[(sents[type_str] == comb_row[type_str]) &
+ (sents[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
+ (sents[rouge_type_str] == rouge1_str) & sents[sentence_str].isin(sentences)]
+ rl_sents = sents[(sents[type_str] == comb_row[type_str]) &
+ (sents[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
+ (sents[rouge_type_str] == rougel_str) & sents[sentence_str].isin(sentences)]
+ r1_sums = []
+ rl_sums = []
+
+ # find summaries containing the new sentence package
+ for col_name in [str(i) for i in range(1, sums[summary_id_str].max() + 1)]:
+ var_r1 = r1_sents[col_name].unique()
+ if len(var_r1) == 1 and var_r1[0] == True:
+ r1_sums.append(col_name)
+ var_rl = rl_sents[col_name].unique()
+ if len(var_rl) == 1 and var_rl[0] == True:
+ rl_sums.append(col_name)
+
+ if len(r1_sums) > 0:
+ new_index, new_sent_indices = get_new_sent_index(new_sent_indices, comb_row[utils.aktenzeichen_str], sents)
+ res = pd.concat([res, get_new_row(comb_row=comb_row, my_sents=r1_sents, r_string=rouge1_str,
+ sums=sums, sum_list=r1_sums, info=info, sent_no=new_index)])
+ if len(rl_sums) > 0:
+ new_index, new_sent_indices = get_new_sent_index(new_sent_indices, comb_row[utils.aktenzeichen_str], sents)
+ res = pd.concat([res, get_new_row(comb_row=comb_row, my_sents=rl_sents, r_string=rougel_str,
+ sums=sums, sum_list=rl_sums, info=info, sent_no=new_index)])
+ for index in r1_sents.index.values:
+ for sum_id in r1_sums:
+ sents.at[index, sum_id] = False
+ for index in rl_sents.index.values:
+ for sum_id in rl_sums:
+ sents.at[index, sum_id] = False
+
+ return pd.concat([sents, res])
+
+
+def get_new_sent_index(index_dict, aktenzeichen, sentences):
+ """
+ Gets a new number for a sentence to add that wasn't used for that aktenzeichen before
+
+ :param index_dict: dict with existing new indices
+ :param aktenzeichen: aktenzeichen to look for
+ :param sentences: sentences to find existing numbers in
+ :return: a new sentence number to use
+ """
+ new_index = index_dict.get(aktenzeichen)
+ if new_index is None:
+ new_index = sentences[(sentences[utils.aktenzeichen_str] ==
+ aktenzeichen.replace('/', '-'))][sent_no_str].max() + 1
+ index_dict[aktenzeichen] = new_index + 1
+ return new_index, index_dict
+
+
+def get_new_row(comb_row, my_sents, r_string, sums, sum_list, info, sent_no):
+ """
+ Creates a new sentence (package) row
+
+ :param comb_row: information concerning the sentence
+ :param my_sents: sentences appearing in that sentence package
+ :param r_string: rouge type string
+ :param sums: all existing summaries
+ :param sum_list: index list of summaries for that sentence
+ :param info: all infos
+ :param sent_no: sentence number for the row
+ :return: a row as dataframe
+ """
+ row_data = {type_str: [comb_row[type_str]], rouge_type_str: [r_string],
+ utils.aktenzeichen_str: [my_sents.iloc[0][utils.aktenzeichen_str]],
+ sentence_str: [comb_row[sentence_str]], sent_no_str: [sent_no]}
+ for col_name in [str(i) for i in range(1, sums[summary_id_str].max() + 1)]: # summary ids run from 1 to max
+ if col_name not in sum_list:
+ row_data[col_name] = [False]
+ else:
+ row_data[col_name] = [True]
+
+ # legal issue
+ legal_issues = my_sents[[issues_str + str(i + 1) for i in range(int(info[issues_str].max()) + 1)]].dropna(axis=1,
+ how='all')
+ for col in legal_issues.columns:
+ row_data[col] = comb_row[evaluation_str + '_new']
+ if 'P' in str(comb_row[evaluation_str + '_new']):
+ # completeness
+ old_sentences = info[(info[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
+ (info[type_str] == comb_row[type_str]) &
+ (info[issues_str] == str(int(col[-1]) - 1))][content_complete_str].values[0].split(' ')
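+ # Rebuild the completeness string: numbers of the sentences that were merged into
+ # the new package are collapsed into one alternative group '(old1-old2,new_no)',
+ # all other sentence numbers are kept unchanged.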
+ new_sentences = ''
+ or_string = ''
+ for sentence_no in old_sentences:
+ if sentence_no in [str(x) for x in my_sents[sent_no_str].values]:
+ or_string += ' ' + sentence_no
+ or_string = or_string.strip()
+ else:
+ new_sentences += ' ' + sentence_no
+ new_sentences = new_sentences.strip()
+ if or_string != '':
+ or_string = '(' + or_string.replace(' ', '-') + ',' + str(sent_no) + ')'
+ new_sentences += ' ' + or_string
+ else:
+ new_sentences += ' ' + str(sent_no)
+ new_sentences = new_sentences.strip()
+ info.at[info[(info[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
+ (info[type_str] == comb_row[type_str]) &
+ (info[issues_str] == str(int(col[-1]) - 1))]
+ [content_complete_str].index.values[0], content_complete_str] = new_sentences
+ return pd.DataFrame.from_dict(row_data)
+
+
+def remove_bad_cats(sums, sents, infos):
+ """
+ Removes all sentences whose legal-issue evaluations contain only 'R', 'T' or no value and
+ rebuilds the affected summaries from the remaining sentences with freshly computed ROUGE scores.
+
+ :param sums: all existing summaries
+ :param sents: all existing sentences
+ :param infos: all existing infos
+ :return: the remaining sentences and the rebuilt summaries
+ """
+ res_sums = pd.DataFrame()
+ res_sents = sents[~sents[[issues_str + str(i + 1) for i in range(int(infos[issues_str].max()) + 1)]].isin(
+ ['R', 'T', np.nan]).all(axis=1)]
+ for _, row in sums.iterrows():
+ my_sents = res_sents[(res_sents[utils.aktenzeichen_str] == row[utils.aktenzeichen_str]) &
+ (res_sents[type_str] == row[type_str]) &
+ (res_sents[rouge_type_str] == row[rouge_type_str]) &
+ (res_sents[str(row[summary_id_str])] == True)]
+ if my_sents.shape[0] > 0:
+ summary = ' '.join(my_sents[sentence_str].values)
+ new_row = {type_str: [row[type_str]], utils.aktenzeichen_str: [row[utils.aktenzeichen_str]],
+ original_str: [row[original_str]], summary_id_str: [row[summary_id_str]],
+ rouge_type_str: [row[rouge_type_str]], summary_str: [summary]}
+ if row[rouge_type_str] == rouge1_str:
+ p, r, f = rouge.rouge_n(reference_summary=row[original_str], created_summary=summary, n=1,
+ pp_options=pp_options, extended_results=True)
+ else:
+ p, r, f = rouge.rouge_l(reference_summary=row[original_str], created_summary=summary,
+ pp_options=pp_options, extended_results=True)
+ new_row['rouge_precision'] = p
+ new_row['rouge_recall'] = r
+ new_row['rouge_f_measure'] = f
+ res_sums = pd.concat([res_sums, pd.DataFrame.from_dict(new_row)], ignore_index=True)
+
+ return res_sents, res_sums
+
+
+def evaluate_summaries_run_two():
+ """
+ Coordinates evaluation of the summaries from the sentences evaluated in the second run
+ """
+ all_summaries, all_summary_sentences = read_or_load_summaries_run_two()
+ evaluated_sentences, evaluated_infos = get_evaluated_sentences(run_two=True)
+ final_summaries = all_summaries[all_summaries[utils.aktenzeichen_str].isin(evaluated_infos[utils.aktenzeichen_str])]
+ final_sentences = all_summary_sentences.merge(evaluated_sentences, how='inner',
+ on=[utils.aktenzeichen_str, type_str, sentence_str])
+
+ # final_sentences, final_summaries = remove_bad_cats(sents=final_sentences,
+ # sums=final_summaries, infos=evaluated_infos)
+ final_summaries = evaluate_all_summaries(evaluated_infos, final_sentences, final_summaries)
+ final_summaries[interval_start_str] = final_summaries.apply(
+ lambda row: (np.ceil(row['rouge_recall'] * 10) - 1) / 10
+ if row[rouge_type_str] == rouge1_str
+ else (np.ceil(row['rouge_f_measure'] * 10) - 1) / 10, axis=1)
+ final_summaries = final_summaries.drop_duplicates([utils.aktenzeichen_str, type_str, rouge_type_str,
+ interval_start_str])
+ final_summaries = final_summaries[final_summaries[interval_start_str] > 0.0]
+ draw_pics(sents=final_sentences, sums=final_summaries, info=evaluated_infos, run_two=True)
+
+
+def evaluate_summaries():
+ """
+ Coordinates evaluation of the summaries from the sentences evaluated
+ """
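+ # Pipeline: load the generated summaries, join the manually evaluated sentences,
+ # recompute the summary-level scores and bucket each summary by the left edge of
+ # its 0.1-wide ROUGE interval, e.g. a recall of 0.53 gives (np.ceil(5.3) - 1) / 10 = 0.5;
+ # finally draw the figures for the kept intervals.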
""" + all_summaries, all_summary_sentences = read_or_load_summaries() + evaluated_sentences, evaluated_infos = get_evaluated_sentences() + final_sentences = all_summary_sentences.merge(evaluated_sentences, how='inner', + on=[utils.aktenzeichen_str, type_str, sentence_str]) + final_summaries = all_summaries[all_summaries[utils.aktenzeichen_str].isin(evaluated_infos[utils.aktenzeichen_str])] + + # final_sentences, final_summaries = remove_bad_cats(sents=final_sentences, sums=final_summaries, + # infos=evaluated_infos) + final_summaries = evaluate_all_summaries(evaluated_infos, final_sentences, final_summaries) + final_summaries[interval_start_str] = final_summaries.apply( + lambda row: (np.ceil(row['rouge_recall'] * 10) - 1) / 10 + if row[rouge_type_str] == rouge1_str + else (np.ceil(row['rouge_f_measure'] * 10) - 1) / 10, axis=1) + final_summaries = final_summaries.drop_duplicates([utils.aktenzeichen_str, type_str, rouge_type_str, + interval_start_str]) + final_summaries = final_summaries[final_summaries[interval_start_str] > 0.0] + draw_pics(sents=final_sentences, sums=final_summaries, info=evaluated_infos) + + +def combine_modus(list_of_string): + """ + For the input list of consecutive strings, konjunktiv sentences are combined with all following sentences until + there is an indicative sentence + + :param list_of_string: strings for comibning + :return: updated list + """ + result_strings = [] + + current_string = '' + for string in list_of_string: + nlp_string = settings.nlp(string) + ind = False + sub = False + verb_mod_tag_list = ['VVFIN', 'VAFIN', 'VMFIN', 'VVIMP', 'VAIMP'] + for token in nlp_string: + if token.tag_ in verb_mod_tag_list: + mood = token.morph.get('Mood') + if 'Ind' in mood: + if token.head.tag_ in verb_mod_tag_list: + head_mood = token.head.morph.get('Mood') + if 'Sub' not in head_mood: + ind = True + else: + ind = True + if 'Sub' in mood: + sub = True + current_string += ' ' + string + current_string = current_string.strip() + if ind or not sub: + result_strings.append(current_string) + current_string = '' + + return result_strings + + +def get_index_in_list(sentence, string_list): + """ + Returns index of a string in a list with some leniance. + + :param sentence: sentence to find + :param string_list: list to look in + :return: index or None, if nothing was found + """ + try: + return string_list.index(sentence) + except ValueError: + short_length = 200 + for i in range(100): + result_list = [string_list.index(l_item) for l_item in string_list + if sentence[:min(100, len(sentence))] in l_item] + if len(result_list) == 1: + return result_list[0] + if len(result_list) == 0: + short_length -= 10 + if len(result_list) > 1: + short_length += 50 + return None + + +def readjust_splitting(old_list): + """ + Some sentences might not be split correctly + + :param old_list: original split sentences + :return: new split sentences + """ + res = [] + for string in old_list: + if 'InfoV. ' in string: + split_list = string.split('InfoV. ') + for i in range(len(split_list) - 1): + split_list[i] = split_list[i] + 'InfoV.' 
+ res += split_list + else: + res.append(string) + return res + + +if __name__ == "__main__": + # pm_judgments = get_evaluation_data(case_list=cases_one_list) + # data = [(a, b, False) for (a, b) in pm_judgments] + # utils.parallel_imap(write_files_for_one_judgement, data) + # evaluate_summaries() + + # pm_judgments = get_evaluation_data(case_list=cases_two_list) + # print_following_sentences() + # data = [(a, b, c, True) for (a, b, c) in pm_judgments] + evaluate_summaries_run_two() + print('Done') diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx new file mode 100644 index 0000000..dcf27e2 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx new file mode 100644 index 0000000..35e4097 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx new file mode 100644 index 0000000..8dd36bd Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx new file mode 100644 index 0000000..cdb6204 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx new file mode 100644 index 0000000..26551d4 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx new file mode 100644 index 0000000..df65db2 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx new file mode 100644 index 0000000..379b85f Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx new file mode 100644 index 0000000..7951f40 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx new file mode 100644 index 0000000..daf1b10 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx new file mode 100644 index 0000000..fd5d683 Binary files /dev/null and 
b/rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx new file mode 100644 index 0000000..08a7728 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx new file mode 100644 index 0000000..7f2665a Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx new file mode 100644 index 0000000..0280506 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx new file mode 100644 index 0000000..692e522 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx new file mode 100644 index 0000000..9f8e3d0 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx new file mode 100644 index 0000000..d6b9c02 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx new file mode 100644 index 0000000..135de89 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx new file mode 100644 index 0000000..1eae348 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx new file mode 100644 index 0000000..52a315c Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx new file mode 100644 index 0000000..a37e1b2 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx new file mode 100644 index 0000000..5c19665 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx differ diff --git 
a/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx new file mode 100644 index 0000000..774a8ff Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx new file mode 100644 index 0000000..a6e6bc4 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx new file mode 100644 index 0000000..4d9b92a Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx new file mode 100644 index 0000000..6e26093 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx new file mode 100644 index 0000000..c8a3013 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx new file mode 100644 index 0000000..51aeefc Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx new file mode 100644 index 0000000..9b49871 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx new file mode 100644 index 0000000..4a014cb Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx new file mode 100644 index 0000000..b13de77 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx new file mode 100644 index 0000000..a28f3b8 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx new file mode 100644 index 0000000..656ea75 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx 
b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx new file mode 100644 index 0000000..d6ae081 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx new file mode 100644 index 0000000..c202d38 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx new file mode 100644 index 0000000..f2236e7 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx new file mode 100644 index 0000000..1a61d1a Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx new file mode 100644 index 0000000..b65241d Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx new file mode 100644 index 0000000..f9dcfe0 Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx new file mode 100644 index 0000000..3c64b7c Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx differ diff --git a/rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx b/rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx new file mode 100644 index 0000000..62fbddb Binary files /dev/null and b/rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx new file mode 100644 index 0000000..1331532 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx new file mode 100644 index 0000000..aa268f4 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx new file mode 100644 index 0000000..2ddff83 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 84-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 
84-20.xlsx new file mode 100644 index 0000000..dfb10f6 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 84-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx new file mode 100644 index 0000000..11c5b50 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx new file mode 100644 index 0000000..08a32ad Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx new file mode 100644 index 0000000..90e8c16 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx new file mode 100644 index 0000000..589956e Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx new file mode 100644 index 0000000..482f717 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx new file mode 100644 index 0000000..8674bb7 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx new file mode 100644 index 0000000..f5e6de6 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx new file mode 100644 index 0000000..e4cfe6a Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx new file mode 100644 index 0000000..fabe9de Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx new file mode 100644 index 0000000..e28c8c6 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx differ diff --git 
a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx new file mode 100644 index 0000000..a63a49f Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx new file mode 100644 index 0000000..dfde839 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx new file mode 100644 index 0000000..c10fd45 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx new file mode 100644 index 0000000..2826344 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx new file mode 100644 index 0000000..c77e3f5 Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx new file mode 100644 index 0000000..4c3ef0a Binary files /dev/null and b/rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx new file mode 100644 index 0000000..9a42bf0 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx new file mode 100644 index 0000000..fde1ae4 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx new file mode 100644 index 0000000..59a978c Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx new file mode 100644 index 0000000..918b89e Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx new file mode 100644 index 0000000..d676d29 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx differ diff --git 
a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx new file mode 100644 index 0000000..a535a94 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx new file mode 100644 index 0000000..0099a33 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx new file mode 100644 index 0000000..ccc4a67 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx new file mode 100644 index 0000000..9b092e4 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx new file mode 100644 index 0000000..fd575c6 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx new file mode 100644 index 0000000..06c89f4 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx new file mode 100644 index 0000000..3c7fdd2 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx new file mode 100644 index 0000000..57be3ab Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx new file mode 100644 index 0000000..1acc22e Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx new file mode 100644 index 0000000..94a3e00 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx new file mode 100644 index 0000000..af9d165 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx new file mode 100644 index 0000000..c3d5aab Binary files /dev/null and 
b/rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx new file mode 100644 index 0000000..113418f Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx new file mode 100644 index 0000000..e20e6d3 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx differ diff --git a/rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx b/rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx new file mode 100644 index 0000000..72af682 Binary files /dev/null and b/rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx differ diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..1c60355 --- /dev/null +++ b/settings.py @@ -0,0 +1,12 @@ +# creating BGH data +import spacy + +remove_brackets = False +server = False + +no_brackets_suffix = "no_br_" +nlp = spacy.load("de_core_news_sm") # small one +# nlp = spacy.load("de_dep_news_trf") # big one, CUDA Problems on Server... + + + diff --git a/test/test_rouge.py b/test/test_rouge.py new file mode 100644 index 0000000..00d569b --- /dev/null +++ b/test/test_rouge.py @@ -0,0 +1,192 @@ +from unittest import TestCase + +import rouge +import utils + + +def get_file_data(filename): + text_file = open(filename, "r", encoding='utf-8') + data = text_file.read() + text_file.close() + return data + + +def run_tests_with_data(self, test_data): + for original, generated_one, generated_two, description in test_data: + print(description) + print('original: ' + original) + rouge_one = rouge.rouge_n(original, generated_one, n=1) + rouge_two = rouge.rouge_n(original, generated_two, n=1) + print('Rouge for one: ' + str(rouge_one) + ' ' + generated_one) + print('Rouge for two: ' + str(rouge_two) + ' ' + generated_two) + combined = generated_two + ' ' + generated_one + combined_rouge = rouge.rouge_n(original, combined, n=1) + print('Rouge for combined: ' + str(combined_rouge) + ' ' + combined) + self.assertGreaterEqual(combined_rouge, rouge_one) + self.assertGreaterEqual(combined_rouge, rouge_two) + + +class RougeTest(TestCase): + + def tests_from_paper(self): + s1 = 'police killed the gunman' + s2 = 'police kill the gunman' + s3 = 'the gunman kill police' + score_s2 = rouge.rouge_l(s1, s2) + self.assertEqual(score_s2, 0.75) + score_s3 = rouge.rouge_l(s1, s3) + self.assertEqual(score_s3, 0.5) + + reference = 'affe birne club düne essen' + summary = 'affe birne feder geld himmel. affe club insel jagd essen.' + lcs = 4 + p = lcs / 12 + r = lcs / 5 + f = 2*(r*p)/(r+p) + r_p, r_r, r_f = rouge.rouge_l(reference_summary=reference, created_summary=summary, pp_options=[utils.pp_option_stopwords], + extended_results=True) + self.assertEqual(r_r, r) + self.assertEqual(r_p, p) + self.assertEqual(r_f, f) + + score_equal = rouge.rouge_l(summary, summary) + self.assertEqual(score_equal, 1) + + def test_one(self): + original = 'Für die Frage, ob alle in Art. 6 Abs. 
1 der Richtlinie 2011/83/EU genannten Informationen objektiv in einem Werbemittel dargestellt werden können, ist erheblich, welchen Anteil diese Informationen am verfügbaren Raum des vom Unternehmer ausgewählten Werbeträgers einnehmen würden; die Werbebotschaft muss gegenüber den Verbraucherinformationen nicht zurücktreten.' + sent_1 = '(1) Zwar ist für die nach der Vorabentscheidung des Gerichtshofs der Europäischen Union maßgebliche Frage, ob alle in Art. 6 Abs. 1 der Richtlinie 2011/83/EU genannten Informationen objektiv in einem Werbemittel dargestellt werden können, erheblich, welchen Anteil diese Informationen am verfügbaren Raum des vom Unternehmer ausgewählten Werbeträgers einnehmen würden.' + sent_2 = 'Aus der Anforderung, die Informationen objektiv in der Werbebotschaft darstellen zu können, ist zu schließen, dass die Werbebotschaft gegenüber den Verbraucherinformationen nicht zurücktreten muss.' + rouge_v1 = rouge.rouge_n(original, sent_1, 1, pp_options=[utils.pp_option_stopwords]) + rouge_v2 = rouge.rouge_n(original, sent_1 + ' ' + sent_2, 1, pp_options=[utils.pp_option_stopwords]) + self.assertGreater(rouge_v2, rouge_v1) + + def test_one_match(self): + original = 'a b c d e.' + score = rouge.rouge_n(original, 'd.', n=1) + self.assertGreater(score, 0) + + def test_extension(self): + original_short = 'a b c d e.' + original_medi = 'a b c d e f g h i j k l m n o.' + original_long = 'a b c d e f g h i j k l m n o p q r s t u v w x y z.' + test_data = [[original_short, 'a b.', 'a b d.', 'small extension short sentence'], + [original_short, 'a.', 'a b c d.', 'large extension short sentence'], + [original_medi, 'a b c d e f g h i.', 'a b c d e f g h i j.', 'small extension medi sentence'], + [original_medi, 'a b c d e f g h i.', 'a b c d e f g h i m n o l.', + 'large extension medi sentence'], + [original_long, 'a b c d e f g h i j k l m n o p q r s t u v.', + 'a b c d e f g h i j k l m n o p q r s t u v w.', 'small extension long sentence'], + [original_long, 'a b c d e f g h i j k.', + 'a b c d e f g h i j k l m n o p q r s t u v w.', 'large extension long sentence'], + ] + print('Test extensions') + run_tests_with_data(self, test_data) + + def test_differing(self): + original_short = 'a b c d e.' + original_medi = 'a b c d e f g h i j k l m n o.' + original_long = 'a b c d e f g h i j k l m n o p q r s t u v w x y z.' + + test_data = [[original_short, 'a b c.', 'a b d.', 'small difference short sentence'], + [original_short, 'a e.', 'a b c.', 'large difference short sentence'], + [original_medi, 'a b c d e f g h i.', 'a b c d e f g h j.', 'small difference medi sentence'], + [original_medi, 'a b c d e f g h i.', 'a b c d j k l m.', + 'large difference medi sentence'], + [original_long, 'a b c d e f g h i j k l m n o p q r s t u v.', + 'a b c d e f g h i j k l m n o p q r s t u w.', 'small difference long sentence'], + [original_long, 'a b c d e f g h i j k.', + 'a b l m n o p q r s t u v.', 'large difference long sentence'], + ] + print('Test differences') + run_tests_with_data(self, test_data) + + def test_rougel_high_precision_or_recall(self): + gold = 'Boot.' + created = 'Boot. Boot.' + r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True, + pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize]) + self.assertEqual(r_p, 1/2) + self.assertEqual(r_r, 1) + self.assertEqual(r_f, 2/3) + + gold = 'Affe Boot. Boot Club.' + created = 'Boot.' 
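+ # after preprocessing the reference has six tokens ('Affe', 'Boot', '.', 'Boot',
+ # 'Club', '.') and the created summary two ('Boot', '.'), both of which match:
+ # precision 2/2, recall 2/6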
+ r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True, + pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize]) + self.assertEqual(r_p, 1) + self.assertEqual(r_r, 2/6) + self.assertEqual(r_f, 1/2) + + + gold = 'Im Rahmen der bei Prüfung der Schutzschranke der Berichterstattung über Tagesereignisse gemäß § 50 ' \ + 'UrhG vorzunehmenden Grundrechtsabwägung ist im Falle der Veröffentlichung eines bislang ' \ + 'unveröffentlichten Werks auch das vom Urheberpersönlichkeitsrecht geschützte Interesse an einer ' \ + 'Geheimhaltung des Werks zu berücksichtigen. Dieses schützt das urheberrechtsspezifische Interesse des ' \ + 'Urhebers, darüber zu bestimmen, ob er mit der erstmaligen Veröffentlichung den Schritt von der ' \ + 'Privatsphäre in die Öffentlichkeit tut und sich und sein Werk damit der öffentlichen Kenntnisnahme ' \ + 'und Kritik aussetzt. Nicht zu berücksichtigen ist bei dieser Abwägung dagegen das Interesse an der ' \ + 'Geheimhaltung von Umständen, deren Offenlegung Nachteile für die Interessen des Staates und seiner ' \ + 'Einrichtungen haben könnten. Dieses Interesse ist nicht durch das Urheberpersönlichkeitsrecht, ' \ + 'sondern durch andere Vorschriften - etwa das Sicherheitsüberprüfungsgesetz, § 3 Nr. 1 Buchst. b IFG ' \ + 'und die strafrechtlichen Bestimmungen gegen Landesverrat und die Gefährdung der äußeren Sicherheit ' \ + 'gemäß §§ 93 ff. StGB - geschützt. ' + created = 'Dieses Interesse ist vielmehr durch die allgemeinen Vorschriften - etwa das ' \ + 'Sicherheitsüberprüfungsgesetz, § 3 Nr. 1 Buchst. b IFG und die strafrechtlichen Bestimmungen gegen ' \ + 'Landesverrat und die Gefährdung der äußeren Sicherheit gemäß §§ 93 ff. ' + r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True, + pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize]) + self.assertLessEqual(r_p, 1) + + gold = 'Der Eigentümer eines Grundstücks ist hinsichtlich der von einem darauf befindlichen Baum (hier: ' \ + 'Birken) ausgehenden natürlichen Immissionen auf benachbarte Grundstücke Störer i.S.d. § 1004 Abs. 1 ' \ + 'BGB, wenn er sein Grundstück nicht ordnungsgemäß bewirtschaftet. Hieran fehlt es in aller Regel, ' \ + 'wenn die für die Anpflanzung bestehenden landesrechtlichen Abstandsregelungen eingehalten sind. 1b. ' \ + 'Ein Anspruch auf Beseitigung des Baums lässt sich in diesem Fall regelmäßig auch nicht aus dem ' \ + 'nachbarlichen Gemeinschaftsverhältnis herleiten. Hält der Grundstückseigentümer die für die ' \ + 'Anpflanzung bestehenden landesrechtlichen Abstandsregelungen ein, hat der Eigentümer des ' \ + 'Nachbargrundstücks wegen der Beeinträchtigungen durch die von den Anpflanzungen ausgehenden ' \ + 'natürlichen Immissionen weder einen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 BGB in unmittelbarer ' \ + 'Anwendung noch einen nachbarrechtlichen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 analog (' \ + 'Abgrenzung zu Senat, Urteil vom 27. Oktober 2017 - V ZR 8/17, ZfIR 2018, 190). ' + created = "Für die Entscheidung des Meinungsstreits ist von dem oben dargelegten Grundsatz auszugehen, " \ + "dass der Eigentümer eines Grundstücks hinsichtlich der von einem darauf befindlichen Baum " \ + "ausgehenden natürlichen Immissionen auf benachbarte Grundstücke Störer i.S.d. § 1004 Abs. 1 BGB " \ + "ist, wenn er sein Grundstück nicht ordnungsgemäß bewirtschaftet. 
Hält der Grundstückseigentümer " \ + "die für die Anpflanzung bestehenden landes-rechtlichen Abstandsregelungen ein, hat der Eigentümer " \ + "des Nachbargrund-stücks wegen der Beeinträchtigungen durch die von den Anpflanzungen ausgehenden " \ + "natürlichen Immissionen weder einen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 BGB in " \ + "unmittelbarer Anwendung noch einen nachbarrechtlichen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 " \ + "analog. Sind die für die Anpflanzung bestehenden landesrechtlichen Abstandsregelungen eingehalten, " \ + "lässt sich ein Anspruch auf Beseitigung der Bäume in aller Regel - und so auch hier - nicht aus " \ + "dem nachbarlichen Gemeinschaftsverhältnis herleiten. Gemäß § 907 Abs. 2 BGB gehören aber Bäume und " \ + "Sträucher nicht zu den Anlagen i.S.d. § 907 Abs. 1 BGB. Ob den Grundstückseigentümer für " \ + "natürliche Immissionen eine „Sicherungspflicht“ trifft und er damit Störer i.S.d. § 1004 Abs. 1 " \ + "BGB ist, ist jeweils anhand der Umstände des Einzelfalls zu prüfen. Rechtsfehlerhaft ist jedoch " \ + "die Auffassung des Berufungsgerichts, der Beklagte sei als Störer i.S.d. § 1004 Abs. 1 BGB für die " \ + "von den Birken ausgehenden Immissionen auf das Grundstück des Klägers verantwortlich. In diesem " \ + "Fall ist er regelmäßig schon nicht Störer, so dass es bereits an einem Beseitigungsanspruch gemäß " \ + "§ 1004 Abs. 1 BGB fehlt und der von dem Berufungsgericht beschriebene Konflikt zwischen den Regeln " \ + "des Bürgerlichen Gesetzbuchs und den landesrechtlichen Vorschriften nicht besteht. Voraussetzung " \ + "hierfür ist jedoch, dass der in Anspruch genommene Grundstückseigentümer für die " \ + "Eigentumsbeeinträchtigung verantwortlich und damit Störer i.S.d. § 1004 Abs. 1 BGB ist. " + + r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True, + pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize]) + self.assertLessEqual(r_p, 1) + self.assertLessEqual(r_r, 1) + + def test_specific(self): + gold = 'Diese Voraussetzungen hat der XII. Zivilsenat für den vorliegenden Fall bejaht.' + created = 'Dies ist insbesondere der Fall, wenn die Sanktion außer Verhältnis zum Gewicht des Vertragsverstoßes und den Folgen für den Schuldner der Vertragsstrafe steht.' + created_2 = 'Deren Untergrenze ist mit 30 € angegeben.' 
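+ # 'created' shares content words with the gold sentence (e.g. 'Fall'), 'created_2'
+ # shares next to nothing, so its F-score should be lower; concatenating both is
+ # expected to score higher than 'created' alone, mirroring the extension tests above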
+ r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
+ pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
+ r_p_2, r_r_2, r_f_2 = rouge.rouge_l(created_summary=created_2, reference_summary=gold, extended_results=True,
+ pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
+
+ r_p_c, r_r_c, r_f_c = rouge.rouge_l(created_summary=created + ' ' + created_2, reference_summary=gold,
+ extended_results=True,
+ pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
+
+ self.assertGreater(r_f, r_f_2)
+ self.assertGreater(r_f_c, r_f)
+
diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..1b2deb8 --- /dev/null +++ b/utils.py @@ -0,0 +1,475 @@
+import json
+import multiprocessing
+import os
+import re
+import shutil
+
+import pandas as pd
+
+import settings
+
+pool_processes = 8
+pool_maxtask = 10
+pool_chunksize = 30
+leitsatz_str = 'leitsatz'
+tenor_str = 'tenor'
+tatbestand_str = 'tatbestand'
+entscheidungsgruende_str = 'entscheidungsgruende'
+aktenzeichen_str = 'aktenzeichen'
+rii_text_columns = [leitsatz_str, tenor_str, tatbestand_str, entscheidungsgruende_str]
+sentence_marks = ['.', ',', ';', '!', '?']
+pp_option_lemmatize = 'preprocessing: lemmatize the text'
+pp_option_stopwords = 'preprocessing: remove stopwords'
+pp_option_case_normalize = 'preprocessing: normalize cases / put to lower'
+pp_option_remove_qout_marks_sing = 'preprocessing: remove quotation marks around single words'
+no_stopword_list = ['nicht', 'kein']
+entsch_gr_start_sentences = ['II.', 'B.', 'B']
+
+
+def server_path(current_path, path):
+ """
+ Method to add the path prefix in case the code is run on the server.
+
+ :param current_path: Path to add when run on server
+ :param path: Path for local use
+ :return: Final path for local or server use
+ """
+ if settings.server:
+ path = current_path + '/' + path
+ return path
+
+
+def open_file(current_path, path, modes, encoding=None, newline=None):
+ """
+ Wraps the builtin open function to adjust to server settings
+
+ :param current_path: path of the calling file to adjust for server (without /)
+ :param path: Path for file loading relative to calling file
+ :param modes: Modes to apply
+ :param newline: newline option of the original method, if None nothing will be passed
+ :param encoding: encoding option of the original method, if None nothing will be passed
+ :return: the opened file
+ """
+ # check the combined case first, otherwise it is unreachable
+ if encoding is not None and newline is not None:
+ return open(server_path(current_path=current_path, path=path), modes, encoding=encoding, newline=newline)
+ if encoding is not None:
+ return open(server_path(current_path=current_path, path=path), modes, encoding=encoding)
+ if newline is not None:
+ return open(server_path(current_path=current_path, path=path), modes, newline=newline)
+ return open(server_path(current_path=current_path, path=path), modes)
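+# For illustration: with settings.server set, open_file('data', 'dataframes/x.pkl', 'rb')
+# opens 'data/dataframes/x.pkl'; locally the relative path is used unchanged.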
+
+
+def file_exists(current_path, path):
+ """
+ Wraps the builtin exists function to adjust to server settings
+
+ :param current_path: path of the calling file to adjust for server (without /)
+ :param path: Path for file loading relative to calling file
+ :return: True if the file exists
+ """
+ return os.path.exists(server_path(current_path=current_path, path=path))
+
+
+def list_dir_files(current_path, path):
+ """
+ Wraps the builtin os.listdir function to adjust to server settings
+
+ :param current_path: path of the calling file to adjust for server (without /)
+ :param path: Path for file loading relative to calling file
+ :return: The filenames of the directory
+ """
+ return os.listdir(server_path(current_path=current_path, path=path))
+
+
+def df_from_pickle(current_path, path):
+ """
+ Wraps the pd.read_pickle function to adjust to server settings
+
+ :param current_path: path of the calling file to adjust for server (without /)
+ :param path: Path for file loading relative to calling file
+ :return: The loaded dataframe
+ """
+ return pd.read_pickle(server_path(current_path=current_path, path=path))
+
+
+def df_to_json(current_path, path, dataframe):
+ """
+ Wraps the df.to_json function to adjust to server settings
+
+ :param current_path: path of the calling file to adjust for server (without /)
+ :param path: Path for file loading relative to calling file
+ :param dataframe: The dataframe to save
+ """
+ dataframe.to_json(server_path(current_path=current_path, path=path))
+
+
+def df_from_json(current_path, path):
+ """
+ Wraps the json.load function in combination with a dataframe creation to adjust to server settings
+
+ :param current_path: path of the calling file to adjust for server (without /)
+ :param path: Path for file loading relative to calling file
+ :return: The loaded dataframe
+ """
+ return pd.DataFrame(json.load(open_file(current_path=current_path, path=path, modes="r")))
+
+
+def time_convert(sec):
+ """
+ Pretty-prints a time span. Format: Time Lapsed = hh:mm:ss
+
+ :param sec: the time to show in seconds
+ """
+ mins = sec // 60
+ sec = sec % 60
+ hours = mins // 60
+ mins = mins % 60
+ print("Time Lapsed = {0}:{1}:{2}".format(int(hours), int(mins), sec))
+
+
+def parallel_imap(function, packaged_args):
+ """
+ Executes the given function in parallel over list data.
+
+ :param function: Function to run in parallel.
+ :param packaged_args: Iterable of argument packages, one per run.
+ :return: Result of the parallel work
+ """
+ if settings.server:
+ pool_obj = multiprocessing.Pool(maxtasksperchild=pool_maxtask)
+ result = pool_obj.imap(function, packaged_args, chunksize=pool_chunksize)
+ else:
+ pool_obj = multiprocessing.Pool(processes=pool_processes)
+ result = pool_obj.imap(function, packaged_args)
+ pool_obj.close()
+ pool_obj.join()
+ return result
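+# imap keeps the input order, e.g. parallel_imap(len, [[1, 2], [3]]) yields 2, then 1.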
+
+
+def get_step_subset_raw(steps, path_to_dest_dataframe, source_data, dest_data, call_path):
+    """
+    Method for stepwise work on datasets. Reads in the already present data and starts
+    where the last run ended. Used for raw pickle files as the destination.
+
+    :param steps: How many rows should be selected now
+    :param path_to_dest_dataframe: Path from which to load the destination data
+    :param source_data: Source dataframe to select the rows from
+    :param dest_data: empty dataframe to load the data into
+    :param call_path: path from which the method was called, for the server path
+    :return: the subset of the source data and the loaded destination data (source, dest)
+    """
+    if steps > 0:
+        try:
+            try:
+                var = df_from_pickle(current_path=call_path, path=path_to_dest_dataframe)
+            except Exception:
+                var = df_from_json(current_path=call_path, path=path_to_dest_dataframe)
+            dest_data = pd.concat([dest_data, var], ignore_index=True)
+            start = dest_data.shape[0]
+        except Exception:
+            # no (readable) destination data yet, start from the beginning
+            start = 0
+        end = start + steps
+        try:  # case: source is a dataframe
+            if end >= source_data.shape[0]:
+                return source_data.iloc[start:], dest_data  # subset
+            else:
+                return source_data.iloc[start:end], dest_data  # subset
+        except AttributeError:  # case: source is a plain list
+            if end >= len(source_data):
+                return source_data[start:], dest_data  # subset
+            else:
+                return source_data[start:end], dest_data  # subset
+
+
+def remove_spaces_before_sentence_marks(text):
+    """
+    Removes unnecessary spaces before '.' and the other sentence marks.
+
+    :param text: Text to replace in
+    :return: The cleaned text
+    """
+    for sentence_mark in sentence_marks:
+        while ' ' + sentence_mark in text:
+            text = text.replace(' ' + sentence_mark, sentence_mark)
+    return text
+
+
+def remove_brackets(text):
+    """
+    Removes all matching round bracket pairs () together with their content. Always takes the
+    first bracket pair that appears in the text, so the closing bracket could also belong to an
+    enumeration like 'a)'.
+
+    :param text: Text to remove the brackets from.
+    :return: Resulting text
+    """
+    startindex = text.find('(')
+    res = ''
+    while startindex > -1:
+        rel_endindex = text[startindex:].find(')')
+        if rel_endindex > -1:
+            endindex = startindex + rel_endindex
+            # in case there is a ' ' in front of the brackets, remove one space
+            if startindex > 0 and text[startindex - 1] == ' ':
+                startindex -= 1
+            res += text[:startindex]
+            text = text[endindex + 1:]
+        else:
+            # unmatched opening bracket, keep the rest of the text as-is
+            break
+        startindex = text.find('(')
+    res += text
+    return res
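+
+
+# Worked examples (editor's addition, made-up inputs):
+#
+#   remove_spaces_before_sentence_marks('Der Antrag , so das Gericht , ist zulässig .')
+#   # -> 'Der Antrag, so das Gericht, ist zulässig.'
+#
+#   remove_brackets('Die Revision (vgl. oben) wird zurückgewiesen.')
+#   # -> 'Die Revision wird zurückgewiesen.'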
+
+
+def remove_leading_keywords_and_listing_sentences(sentences):
+    """
+    Method intended for Leitsätze. Some of them start with a single keyword in the first line;
+    this is removed. Additionally, sentences which are only a listing ('1.') are removed.
+
+    :param sentences: List of sentences in the original order to remove these things from
+    :return: the list of sentences after removing
+    """
+    # remove leading keywords and sentences which are only enumerations
+    sentences_var = list()
+    sentence_var = ''
+    for i in range(len(sentences)):
+        sentence = sentences[i].strip()
+        if len(sentence) == 0:  # guard against empty sentences
+            continue
+        if len(sentence) > 1 and sentence[-1] == '.' and ' ' not in sentence:  # at least two chars
+            if any(char.isdigit() for char in sentence) and sentence[0].isdigit():
+                # most likely an enumeration like '1.'
+                continue
+        if i > 0 or (i == 0 and len(sentence) > 20):
+            # most likely not a short keyword at the beginning
+            if sentence[-1] in ['.', ',', ':', ';', '!', '?']:
+                # sentence end
+                sentence_var += sentence
+                sentences_var.append(remove_spaces_before_sentence_marks(sentence_var))
+                sentence_var = ''
+            else:
+                # continuing sentence
+                sentence_var += sentence + ' '
+    return sentences_var
+
+
+def prepare_leitsatz(l_satz):
+    """
+    Does the preparation for Leitsätze: first splits into sentences, then removes leading
+    keywords and single listing sentences, and finally removes leading listings of sentences.
+
+    :param l_satz: Original Leitsatz as one string
+    :return: prepared Leitsatz as a list of strings
+    """
+    sentences = split_into_sentences(l_satz)
+    sentences = remove_leading_keywords_and_listing_sentences(sentences)
+    sentences = [remove_leading_listing(sentence) for sentence in sentences]
+    return sentences
+
+
+def select_list_subset(list_of_string, start_strings, end_string=None):
+    """
+    Selects a subset of a list of strings (case-sensitive). Copying starts at the first string
+    that matches any of the start_strings and stops at end_string. If no start string is found,
+    the whole original list is returned. Passing several start strings is useful because, for
+    example, section II. of the Entscheidungsgründe is sometimes opened by 'B.' instead of 'II.'.
+
+    :param list_of_string: List to get the subset from
+    :param start_strings: List of strings at which to start copying
+    :param end_string: First string at which to stop copying; if None, copy to the end
+    :return: Selected subset
+    """
+    result_list = []
+    copy = False
+    for i in range(len(list_of_string)):
+        string = list_of_string[i]
+        if string in start_strings:
+            copy = True
+        if end_string is not None and string == end_string:
+            copy = False
+        if copy:
+            result_list.append(string)
+    # if nothing or very little was found, fall back to the whole list
+    if len(result_list) == 0 or len(result_list) / len(list_of_string) < 0.2:
+        return list_of_string
+    return result_list
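+
+
+# Worked example (editor's addition, made-up input): copy from the first matching start
+# string up to, but excluding, the end string.
+#
+#   select_list_subset(['Rubrum', 'II.', 'Satz eins.', 'III.', 'Satz zwei.'],
+#                      start_strings=['II.', 'B.'], end_string='III.')
+#   # -> ['II.', 'Satz eins.']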
+
+
+def abbreviation_ending(text):
+    """
+    Checks whether an input text ends with a known legal abbreviation.
+    Known issue: numbers and roman numerals followed by a dot aren't matched.
+
+    :param text: Input text
+    :return: True if it ends with such an abbreviation, False otherwise
+    """
+    abbrev_list = ['A.', ' a.', 'a.A.', 'a.a.O.', 'ABl.', ' abl.', 'Abs.', ' abs.', 'Abschn.', 'Abse.',
+                   ' abzgl.', 'a.D.', 'a.E.', ' a.F.', ' ähnl.', 'a.l.i.c.', ' allg.', ' allgem.',
+                   'Alt.', 'AmtsBl.', ' and.', ' angef.', 'Anh.', 'Anl.', 'Anm.', ' Art.', '(Art.', ' aufgeh.',
+                   'Aufl.', ' ausf.', 'Ausn.', 'BAnz.', 'BArbBl.', 'BayJMBl.', 'Bd.', 'Bde.', 'Bdg.',
+                   'Bearb.', ' begr.', 'Beil.', 'Bek.', ' ber.', ' bes.', 'Beschl.', ' best.', ' bestr.',
+                   'Betr.', ' betr.', 'Bf.', 'BGBl.', ' bish.', ' Bl.', 'BPräs.', 'BReg.', 'Bsp.', 'Bst.',
+                   'BStBl.', 'BT-Drucks.', 'Buchst.', 'bzgl.', 'bzw.', 'c.i.c.', 'Co.', 'c.p.c.',
+                   'c.s.q.n.', 'Ct.', ' dar.', 'Darst.', ' ders.', 'd.h.', 'Diss.', ' div.', 'Dr.',
+                   'Drucks.', ' dto.', 'DVBl.', ' ebd.', ' Ed.', 'E.G.', ' eingef.', 'Einf.', 'Einl.',
+                   ' einschl.', 'Erg.', ' erk.Sen.', ' erk.', ' Erl.', 'etc.', 'E.U.', ' e.V.',
+                   'EVertr.', ' evtl.', 'E.W.G.', ' F.', ' f.', ' Fa.', ' Festschr.', ' ff.', ' Fn.',
+                   ' form.', ' fr.', ' fr.Rspr.', ' Fz.', 'GBl.', ' geänd.', 'Gedschr.', ' geg.',
+                   ' gem.', 'Ges.', ' gg.', ' ggf.', ' ggü.', ' ggüb.', ' Gl.', ' GMBl.', 'G.o.A.',
+                   'Grds.', ' grdsl.', 'Großkomm.', 'Großkomm.z.', 'GVBl.', 'GVOBl.', ' h.A.', 'Halbs.',
+                   ' h.c.', 'Hdlg.', 'Hess.', ' heut.', ' heut.Rspr.', ' hins.', ' h.L.', ' h.Lit.',
+                   ' h.M.', 'Hrsg.', ' h.Rspr.', 'HS.', 'Hs.', ' i.A.', ' ib.', ' ibd.', ' ibid.',
+                   'i.d.', 'i.d.F.', 'i.d.R.', 'i.d.S.', 'i.E.', 'i.e.', 'i.e.S.', 'i.H.d.', 'i.H.v.',
+                   'i.K.', ' incl.', ' inkl.', 'inkl.MwSt.', ' insb.', ' insbes.', 'Int.', ' i.O.',
+                   ' i.R.', ' i.R.d.', 'i.S.', 'i.S.d.', 'i.S.e.', 'i.S.v.', 'i.ü.', ' iur.', 'i.V.',
+                   'i.V.m.', 'i.W.', 'i.Wes.', 'i.w.S.', 'i.Zw.', 'Jahrb.', ' jew.', ' Jh.', 'JMBl.',
+                   ' jur.', ' Kap.', ' Ko.', ' krit.', ' kzfr.', 'Lb.', 'Lfg.', 'lgfr.', ' Lief.',
+                   'Lit.', ' lit.', ' lt.', 'Ltd.', 'M.A.', 'm.Änd.', 'MABl.', 'mat.', 'm.a.W.', 'm.E.',
+                   ' med.', 'mgl.', 'Mglkt.', 'MinBl.', 'Mio.', ' Mot.', 'M.M.', 'm.N.', 'Mod.',
+                   ' mögl.', 'Mot.', 'MünchKomm.', 'm.w.', 'm.w.N.', 'MwSt.', 'Mwst.', 'm.W.v.',
+                   'm.zust.Anm.', 'Nachw.', 'Nachw.b.', ' nat.', 'Nds.', 'Neubearb.', 'Neuf.',
+                   ' neugef.', 'n.F.', 'Nr.', 'Nrn.', ' o.', 'o.Ä.', ' od.', ' oec.', ' öff.',
+                   ' o.g.', ' österr.', 'p.F.V.', ' pharm.', ' phil.', ' pol.', 'Postf.', ' pp.',
+                   ' ppA.', ' ppa.', 'Prof.', 'Prot.', ' publ.', ' p.V.', 'p.V.V.', 'q.e.d.',
+                   'RdErl.', 'Rdn.', 'Rdnr.', 'RdSchr.', ' rel.', ' rer.', 'RGBl.', 'Rn.', 'Rspr.',
+                   'Rz.', 'S.', ' s.', 's.a.', 'Schr.', ' scil.', 'Sen.', ' sinngem.', 'SiZess.',
+                   'Slg.', 's.o.', ' sog.', 'Sonderbeil.', 'Stpfl.', ' str.', ' st.', 'st.Rspr.',
+                   ' st. Rspr.', 'stud.iur.', 's.u.', ' teilw.', ' theol.', 'Thür.', ' TO.', ' tw.',
+                   'Tz.', ' u.', 'u.a.', 'UAbs.', 'u.a.m.', ' umstr.', ' unmgl.', 'Unmglkt.', ' unmögl.',
+                   'Urt.', ' usw.', ' u.U.', ' V.', ' v.', 'Var.', 'Ver.', ' vgl.', 'V.m.', 'VOBl.',
+                   'Vor.', 'Vorbem.', 'Warn.', ' weg.', ' wg.', 'W.G.G.', 'w.z.b.w.', 'z.B.', 'z.Hd.',
+                   'Ziff.', 'z.T.', ' zust.', 'zust.Anm.', ' zw.', 'z.Z.', ' zzgl.', ';', 'II.1.a.', '(s.',
+                   ]
+    for abbrev in abbrev_list:
+        if text.endswith(abbrev):
+            return True
+    # single character followed by a dot, e.g. an initial
+    if len(text) >= 3 and re.search(" .\\.", text[-3:]):
+        return True
+    return False
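+
+
+# Worked examples (editor's addition, made-up inputs):
+#
+#   abbreviation_ending('Das folgt aus dem Gesetz, z.B.')   # -> True
+#   abbreviation_ending('Der Vertrag ist nichtig.')         # -> False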
+
+
+def remove_leading_listing(sentence):
+    """
+    Removes leading listings / enumerations like '1.' or 'a)'.
+
+    :param sentence: Sentence to remove from
+    :return: Processed sentence
+    """
+    return split_leading_listing(sentence)[1]
+
+
+def split_leading_listing(sentence):
+    """
+    Splits the sentence from a possible listing ('1.' or 'a)') at the start.
+
+    :param sentence: Sentence to split
+    :return: (start, rest), where start is the listing or None if there is no listing, and
+             rest is the remainder of the sentence or the original sentence if there was no listing
+    """
+    first_word = sentence.split()
+    if first_word is None or len(first_word) == 0:
+        first_word = ''
+    else:
+        first_word = first_word[0]
+    rest = sentence[len(first_word) + 1:]
+    # could be a name like M. Leicht
+    if (first_word.endswith('.') or first_word.endswith(')')) and len(rest.split()) > 1 and first_word != 'Art.':
+        # enumeration!
+        return first_word, rest
+    else:
+        return None, sentence
+
+
+def split_into_sentences(input_text):
+    """
+    Splits text into sentences. Uses spaCy sentences, but fixes sentences broken at newlines
+    or at abbreviations.
+
+    :param input_text: Text to split into sentences
+    :return: A list of the sentences that were split
+    """
+    paragraphs = input_text.split('\n')
+    sentences = list()
+    sentence_var = ''
+    # roughly split the original text into sentences
+    for paragraph in paragraphs:
+        nlp_paragraph = settings.nlp(paragraph)
+        sentences_paragraph = []
+        for sent in nlp_paragraph.sents:
+            sent = sent.text.strip()
+            # some leading listings aren't detected
+            a, b = split_leading_listing(sent)
+            if a is not None:
+                sentences_paragraph.append(a)
+            sentences_paragraph.append(b)
+        for i in range(0, len(sentences_paragraph)):
+            # add a space before the next token if it isn't a sentence mark
+            if not (sentences_paragraph[i].startswith('.') or sentences_paragraph[i].startswith(':')
+                    or sentences_paragraph[i].startswith('?') or sentences_paragraph[i].startswith('!')):
+                sentence_var += ' '
+            sentence_var += sentences_paragraph[i]
+            if (sentences_paragraph[i].endswith('.') or sentences_paragraph[i].endswith(':')
+                    or sentences_paragraph[i].endswith('?') or sentences_paragraph[i].endswith('!')) \
+                    and not abbreviation_ending(sentence_var) \
+                    and not sentence_var.strip() == '':
+                # the sentence is most likely really finished
+                sentences.append(sentence_var.strip())
+                sentence_var = ''
+        if not sentence_var.strip() == '':
+            # a sentence must also end at the end of a paragraph
+            sentences.append(sentence_var.strip())
+            sentence_var = ''
+    # end of the whole text
+    if sentence_var.strip() != '':
+        sentences.append(sentence_var.strip())
+    return sentences
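+
+
+# Illustrative sketch (editor's addition): the exact split depends on the spaCy model
+# configured in settings.nlp, but on a made-up Leitsatz fragment the intended result is:
+#
+#   split_into_sentences('Die Klage ist zulässig. Sie ist aber unbegründet.')
+#   # -> ['Die Klage ist zulässig.', 'Sie ist aber unbegründet.']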
+
+
+def preprocess_text(text, options):
+    """
+    Allows simple preprocessing such as lemmatization on strings.
+
+    :param text: Text to preprocess
+    :param options: Options specifying which preprocessing steps to apply; if None, the text is returned unchanged
+    :return: the preprocessed text; if text is None, the result is ''
+    """
+    if text is None:
+        return ''
+    if options is None:
+        return text
+    text_spacy = settings.nlp(text)
+    result_text = ''
+    for token in text_spacy:
+        # stopword removal: keep the token if it is no stopword, if stopwords shouldn't be
+        # removed, or if it is on the list of stopwords to keep
+        if not token.is_stop or pp_option_stopwords not in options or token.lemma_ in no_stopword_list:
+            # lemmatization if wanted
+            if pp_option_lemmatize in options and token.text not in sentence_marks:
+                to_append = token.lemma_
+            else:
+                to_append = token.text
+            if pp_option_remove_qout_marks_sing in options and to_append[0] == '"' and to_append[-1] == '"':
+                to_append = to_append.replace('"', '')
+            result_text += to_append + ' '
+    result_text = result_text.strip()
+    # case normalization: put everything to lower
+    if pp_option_case_normalize in options:
+        return result_text.lower()
+    else:
+        return result_text
+
+
+def create_dir(current_path, directory_name, delete=True):
+    """
+    Creates a directory if it doesn't exist.
+
+    :param current_path: path of the calling file
+    :param directory_name: name / path to create
+    :param delete: if True, an old directory with the same name is deleted first
+    """
+    if delete and file_exists(current_path=current_path, path=directory_name):
+        shutil.rmtree(server_path(current_path=current_path, path=directory_name))
+    if not file_exists(current_path=current_path, path=directory_name):
+        os.makedirs(server_path(current_path=current_path, path=directory_name))
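+
+
+# Minimal end-to-end sketch (editor's addition; names and paths are made up, and it
+# assumes settings.server is False so that paths stay local): create a directory,
+# pickle a small dataframe through open_file, and read it back via df_from_pickle.
+#
+#   def _demo_roundtrip():
+#       create_dir(current_path='.', directory_name='tmp_demo', delete=True)
+#       df = pd.DataFrame({aktenzeichen_str: ['I ZR 1/23']})
+#       with open_file(current_path='.', path='tmp_demo/demo.pkl', modes='wb') as f:
+#           df.to_pickle(f)
+#       return df_from_pickle(current_path='.', path='tmp_demo/demo.pkl')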