import math
import numpy as np
import pandas as pd
import xlsxwriter
import seaborn as sns
import matplotlib.pyplot as plt
from nlp_rake import Rake
import data.download_rii
import pm_summary.annotation_evaluation
import rouge
import settings
import utils
from pm_summary.annotation_evaluation import pm_sent
current_dir = 'rouge_evalauation/'
eval_path = 'manual_evaluation/'
eval_run_two_path = 'second_eval/'
evaluated_path = 'evaluated_data/'
picture_path = 'figures/'
summary_sentences_path = 'summary_sentences/'
data_path = 'dataframes/'
all_summaries_df_name = 'all_summaries.json'
sum_sentences_df_name = 'summary_sentences.json'
issues_str = 'legal_issue'
abstr_complete_str = 'abstr_complete'
content_complete_str = 'content_complete'
type_str = 'ls_pm'
sent_no_str = 'sentence_number'
original_str = 'original'
interval_start_str = 'interval_start'
sentence_str = 'sentence'
evaluation_str = 'eval'
summary_id_str = 'summary_id:'
summary_str = 'summary'
rouge_r_str = 'rouge_recall'
rouge_p_str = 'rouge_precision'
rouge_f_str = 'rouge_f_measure'
rouge1_str = 'rouge1'
rougel_str = 'rougel'
rouge_type_str = rouge1_str + ' or ' + rougel_str
pp_options = [utils.pp_option_stopwords, utils.pp_option_lemmatize]
cases_one_list = ['I_ZR_23-18', 'I_ZR_139-15', 'III_ZR_35-18', 'III_ZR_42-19',
'III_ZR_55-19', 'III_ZR_67-18', 'III_ZR_113-18', 'III_ZR_292-17',
'III_ZR_391-17', 'V_ZR_112-18', 'V_ZR_176-17', 'V_ZR_218-18',
'V_ZR_254-17', 'V_ZR_273-17', 'VI_ZR_506-17', 'VII_ZR_151-18',
'VIII_ZR_94-17', 'VIII_ZR_277-16', 'X_ZR_96-17', 'XII_ZR_13-19']
cases_two_list = ['I_ZR_146-20', 'I_ZR_153-17', 'II_ZR_84-20', 'II_ZR_152-20',
'III_ZR_25-20', 'III_ZR_79-21', 'IV_ZR_144-21', 'IV_ZR_253-20',
'V_ZR_8-19', 'V_ZR_299-19', 'VI_ZR_128-20', 'VI_ZR_252-19',
'VIa_ZR_418-21', 'VII_ZR_78-20', 'VII_ZR_192-20', 'VIII_ZR_21-19',
'VIII_ZR_66-17', 'X_ZR_107-16', 'XI_ZR_7-19', 'XI_ZR_345-18']
def select_greedy_summary(split_text, gold_summary, interval_aim, eval_func):
"""
Selects a summary from a text in a greedy fashion.
:param split_text: list of sentences; the text to choose sentences from, already split into sentences
:param gold_summary: ideal summary
:param interval_aim: (start, end) of interval for the intended final rouge score
:param eval_func: rouge score to evaluate the summary, as a function with the arguments (created, gold)
:return: [(rouge, sentence)*] with one entry per added sentence, where sentence is the newly added sentence and
rouge the score of the summary after adding it
"""
start_aim, end_aim = interval_aim
result = []
result_summary = ''
current_split_text = [sent for sent in split_text]
max_rouge = eval_func(result_summary, gold_summary)
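# greedy loop: in every pass add the sentence that improves the score the most, until the score reaches the
# start of the target interval; candidates that would overshoot end_aim are dropped from the pool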
while max_rouge <= start_aim and len(current_split_text) > 0:
new_sent = ''
for sent in current_split_text:
var_result_summary = result_summary + ' ' + sent
var_result_summary = var_result_summary.strip()
new_rouge = eval_func(var_result_summary, gold_summary)
if new_rouge > end_aim:
current_split_text.remove(sent)
elif max_rouge < new_rouge:
max_rouge = new_rouge
new_sent = sent
if new_sent != '':
result_summary += ' ' + new_sent
result_summary = result_summary.strip()
current_split_text.remove(new_sent)
result.append((max_rouge, new_sent))
else:
break
return result
def get_evaluation_data(case_list):
"""
Loads the judgement data and the press release evaluation files, combines and returns them.
Removes press releases without Leitsatz or judgement data.
:param case_list: list of cases for the run
:return: [(pm, judg_row_data(aktenzeichen, leitsatz, tenor, entsch_gr))]
"""
ls_data = data.download_rii.get_selected_bgh_data(directory='..//data//')
pm_data_list = pm_summary.annotation_evaluation.get_all_pm_files()
pm_prepared_list = []
for pm_filename, pm_file_data in pm_data_list:
if not any(case in pm_filename for case in case_list):
continue
aktenzeichen = pm_filename.replace('.xlsx', '').replace('_', ' ').replace('-', '/')
original_pm = ''
for i in range(len(pm_file_data)):
sent_dict = pm_file_data[i + 1]
original_pm += ' ' + str(sent_dict[pm_sent])
pm_prepared_list.append((aktenzeichen, original_pm))
pm_prepared_list = [(pm,
ls_data[ls_data[utils.aktenzeichen_str] == aktenzeichen]
[[utils.aktenzeichen_str, utils.leitsatz_str, utils.tenor_str,
utils.entscheidungsgruende_str]].squeeze())
for (aktenzeichen, pm) in pm_prepared_list]
# remove pms without ls
pm_prepared_list = [(pm, row) for (pm, row)
in pm_prepared_list if row[utils.leitsatz_str] is not None]
return pm_prepared_list
def elaborated_sentence_splitting(text_to_split):
"""
Sentence splitting for entscheidungsgruende with readjustment of the splitting where it went wrong.
:param text_to_split: Raw text
:return: split text as list
"""
# select entscheidungsgruende II as split sentences
res = []
for sentence in utils.split_into_sentences(text_to_split):
first, rest = utils.split_leading_listing(sentence)
if first is not None:
res.append(first)
res.append(rest)
res = rejoin_wrong_splitting(res)
res = readjust_splitting(res)
return res
def prepare_sentences(row_data):
"""
Prepares leitsatz, entscheidungsgruende and tenor. Splits them into sentences, removes listings, etc.
:param row_data: series containing the data
:return: l_fin, l_list, eg_list, combined_list with l_fin the leitsatz as a string, l_list leitsatz as list of
string, eg_list list of entscheidungsgruende sentences and combined_list the list of sentences of
entscheidungsgruende and tenor
"""
l_list = utils.prepare_leitsatz(row_data[utils.leitsatz_str])
l_fin = ' '.join(l_list)
eg_list = elaborated_sentence_splitting(row_data[utils.entscheidungsgruende_str])
# select entscheidungsgruende II as split sentences
eg_list = utils.select_list_subset(eg_list, utils.entsch_gr_start_sentences)
eg_list = [sent for sent in eg_list if len(sent.split()) > 1]
combined_list = eg_list + elaborated_sentence_splitting(row_data[utils.tenor_str])
return l_fin, l_list, eg_list, combined_list
def rejoin_wrong_splitting(sent_list):
"""
Some sentences are split wrongly. They are connected here again.
:param sent_list: list to check
:return: updated list
"""
res = []
combined = ''
for string in sent_list:
combined += ' ' + string
combined = combined.strip()
if not string.endswith('für die Bemessung des Nutzungsvorteils:') and \
not string.endswith('GB, Stand:') and not string.endswith('Probefahrt:') \
and not string.strip() == '§ 89 Abs. 1 II.' and not string.endswith('Medizinprodukte, A. VI.')\
and not combined.endswith('te, A. VI. 2.') \
and not string.endswith('Gemeinschaft:') and not string.endswith('InfoV.'): # no special cases
res.append(combined)
combined = ''
return res
def update_summaries_if_needed(interval_id, existings_summaries, possible_sentences, gold_sum, rouge_1, max_intervals,
max_interval_index):
"""
Method tries to create a summary for a given interval. By keeping track of all rouge values along the way, summaries
of lower rouge may also be found. This information is then updated in max_intervals and existings_summaries
:param interval_id: interval_id for interval to check. Interval start = (interval_id - 1) / 10
:param existings_summaries: already existing summaries to intervals for this task
:param possible_sentences: possible sentences to choose
:param gold_sum: gold summary to compare
:param rouge_1: True if ROUGE-1 should be calculated, False otherwise
:param max_intervals: List for keeping track of the maximum intervals. By starting with the highest possible rouge
interval, unreachable rouge values can be detected and skipped.
:param max_interval_index: index in list for current task
:return: existings_summaries, max_intervals with updated values
"""
if rouge_1:
rouge_index = 1
else:
rouge_index = 2
if existings_summaries[interval_id - 1][rouge_index] == '' and \
(interval_id - 1 < max_intervals[max_interval_index] or max_intervals[max_interval_index] == -1):
# no summary yet
if rouge_1:
result_list = select_greedy_summary(split_text=possible_sentences, gold_summary=gold_sum,
interval_aim=[(interval_id - 1) / 10, interval_id / 10],
eval_func=lambda created, gold:
rouge.rouge_n(reference_summary=gold,
created_summary=created,
pp_options=pp_options, n=1))
else:
result_list = select_greedy_summary(split_text=possible_sentences, gold_summary=gold_sum,
interval_aim=[(interval_id - 1) / 10, interval_id / 10],
eval_func=lambda created, gold:
rouge.rouge_l(reference_summary=gold,
created_summary=created,
pp_options=pp_options))
summary = ''
for (rouge_v, sentence) in result_list:
summary += ' ' + sentence
summary = summary.strip()
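# map the rouge value reached so far to its 0.1-wide interval bucket (index i covers [i/10, (i+1)/10))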
index = math.floor(rouge_v * 10)
if existings_summaries[index][rouge_index] == '': # summary found
if rouge_1:
existings_summaries[index] = (existings_summaries[index][0], summary, existings_summaries[index][2])
else:
existings_summaries[index] = \
(existings_summaries[index][0], existings_summaries[index][1], summary)
if index > max_intervals[max_interval_index]:
max_intervals[max_interval_index] = index
return existings_summaries, max_intervals
def preselect_sentences(sentence_list, gold_sum):
"""
Makes a preselection of sentences. Removes sentences containing fewer than two keywords or starting with one of the
given phrases, and combines Konjunktiv sentences until the next indicative sentence.
:param sentence_list: list to preselect from
:param gold_sum: gold summary to create keywords from
:return: resulting sentences
"""
# combine konjunktiv
res = combine_modus(sentence_list)
# keywords
rake = Rake(
min_chars=1,
max_words=4,
language_code='de',
stopwords=settings.nlp.Defaults.stop_words,
)
keywords = rake.apply(gold_sum)
wordlist = set()
for keywordstring, _ in keywords:
for token in settings.nlp(keywordstring):
wordlist.add(token.lemma_)
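# keep only sentences that contain at least two of the keyword lemmas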
res_var = []
for sent in res:
keyword_counts = [1 for word in sent.split(' ') if (len(settings.nlp(word)) > 0)
and settings.nlp(word)[0].lemma_ in wordlist]
if sum(keyword_counts) >= 2:
res_var.append(sent)
res = res_var
# remove sentences with bad phrases
phrases_list = ['Es kann dahinstehen', 'Es kann dahingestellt bleiben',
'Dabei kann dahingestellt bleiben ',
'Es kann offenbleiben, dass', 'Es kann offenbleiben, ob',
'Es bedarf keiner Entscheidung, ob',
'Das Berufungsgericht hat zu hohe Anforderungen gestellt,',
'Entgegen der Auffassung der Revision',
'Entgegen der Auffassung des Berufungsgerichts',
'Jedenfalls greift die Argumentation des Berufungsgerichts nicht',
'Jedenfalls greift die Argumentation des Berufungsgerichts zu kurz', 'Selbst wenn']
res = [sentence for sentence in res if not sentence.startswith(tuple(phrases_list))]
return res
def write_files_for_one_judgement(case):
"""
Writes the evaluation files for one judgement. Goes through all intervals from 0.0-0.1 to 0.9-1.0,
creates summaries and writes the results to the files
:param case: (pm, row, improved) as resulting from get_evaluation_data, improved indicates whether the
improved version should be used
"""
pm, row, improved = case
l_fin, l_list, eg_list, combined_list = prepare_sentences(row)
if improved:
eg_list = preselect_sentences(eg_list, l_fin)
combined_list = preselect_sentences(combined_list, pm)
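# one maximum-interval tracker per task: ls/ROUGE-1, ls/ROUGE-L, pm/ROUGE-1, pm/ROUGE-L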
max_intervals = [-1] * 4
ls_sums = [(str((i - 1) / 10) + '-' + str(i / 10), '', '') for i in range(1, 12, 1)]
pm_sums = [(str((i - 1) / 10) + '-' + str(i / 10), '', '') for i in range(1, 12, 1)]
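# go through the intervals from highest to lowest so that unreachable intervals are detected early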
for i in range(11, 1, -1):
ls_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=l_fin, existings_summaries=ls_sums,
max_interval_index=0, max_intervals=max_intervals,
possible_sentences=eg_list, rouge_1=True)
ls_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=l_fin, existings_summaries=ls_sums,
max_interval_index=1, max_intervals=max_intervals,
possible_sentences=eg_list, rouge_1=False)
pm_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=pm, existings_summaries=pm_sums,
max_interval_index=2, max_intervals=max_intervals,
possible_sentences=combined_list, rouge_1=True)
pm_sums, max_intervals = update_summaries_if_needed(interval_id=i, gold_sum=pm, existings_summaries=pm_sums,
max_interval_index=3, max_intervals=max_intervals,
possible_sentences=combined_list, rouge_1=False)
# ROUGE Overviews
write_rouge_overview((l_fin, ls_sums, eg_list), (pm, pm_sums, combined_list), row[utils.aktenzeichen_str],
improved=improved)
# Evaluation Files
write_evaluation_files((l_fin, ls_sums), (pm, pm_sums), row[utils.aktenzeichen_str], (eg_list, combined_list),
improved)
def combine_consecutive_sentences(sentences_to_combine, original_list):
"""
In case sentences in the first list are consecutive sentences in the second list, they are combined into one string
:param sentences_to_combine: strings here might be combined
:param original_list: original list for getting order
:return: updated list
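Example (assumed values): sentences_to_combine=['B', 'A', 'D'] with original_list=['A', 'B', 'C', 'D'] yields
['A B', 'D'].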
"""
indices = sorted([(get_index_in_list(sent, original_list), sent) for sent in sentences_to_combine])
res = []
old_index = -1
current_package = ''
for index, sent in indices:
if index - old_index == 1: # consecutive sentences
current_package += ' ' + sent
current_package = current_package.strip()
else: # old package is done
if current_package != '':
res.append(current_package)
current_package = sent
old_index = index
if current_package != '':
res.append(current_package)
return res
def write_evaluation_files(ls_data, pm_data, aktenzeichen, sent_lists, improved):
"""
Writes the excel files for legal evaluation.
:param improved: if True, the improved version is used
:param ls_data: ls, ls_sums with ls the leitsatz and ls_sums the created summaries and intervals
:param pm_data: pm, pm_sums with pm the press release and pm_sums the created summaries and intervals
:param aktenzeichen: aktenzeichen of the case
:param sent_lists: (eg_list, combined_list) with the original sentences for finding consecutive sentences
"""
eg_list, combined_list = sent_lists
ls_sentences = []
pm_sentences = []
ls, ls_sums = ls_data
pm, pm_sums = pm_data
for _, r1, rl in ls_sums:
r1_sents = elaborated_sentence_splitting(r1)
if improved:
r1_sents = combine_modus(r1_sents)
r1_sents = combine_consecutive_sentences(r1_sents, eg_list)
r1_sents = [sent for sent in r1_sents if sent not in ls_sentences]
ls_sentences += r1_sents
rl_sents = elaborated_sentence_splitting(rl)
if improved:
rl_sents = combine_modus(rl_sents)
rl_sents = combine_consecutive_sentences(rl_sents, eg_list)
rl_sents = [sent for sent in rl_sents if sent not in ls_sentences]
ls_sentences += rl_sents
for _, r1, rl in pm_sums:
r1_sents = elaborated_sentence_splitting(r1)
if improved:
r1_sents = combine_modus(r1_sents)
r1_sents = combine_consecutive_sentences(r1_sents, combined_list)
r1_sents = [sent for sent in r1_sents if sent not in pm_sentences]
pm_sentences += r1_sents
rl_sents = elaborated_sentence_splitting(rl)
if improved:
rl_sents = combine_modus(rl_sents)
rl_sents = combine_consecutive_sentences(rl_sents, combined_list)
rl_sents = [sent for sent in rl_sents if sent not in pm_sentences]
pm_sentences += rl_sents
if improved:
savepath = eval_path + eval_run_two_path + 'sentences/'
else:
savepath = eval_path + 'sentences/'
utils.create_dir(current_path=current_dir, directory_name=savepath, delete=False)
workbook = xlsxwriter.Workbook(
utils.server_path(current_path=current_dir,
path=savepath + aktenzeichen.replace('/', '-') + '.xlsx'))
# sorting to give no indication of the original ranking
ls_sentences = sorted(ls_sentences)
pm_sentences = sorted(pm_sentences)
write_one_evaluation_worksheet(workbook, 'Leitsatz', ls_sentences)
write_one_evaluation_worksheet(workbook, 'Pressemitteilung', pm_sentences)
workbook.close()
def write_one_evaluation_worksheet(workbook, worksheetname, sentences):
"""
Writes one excel sheet either for press releases or leitsatz
:param workbook: excel workbook to write in
:param worksheetname: name of the sheet
:param sentences: the sentences to write
"""
worksheet = workbook.add_worksheet(name=worksheetname)
cell_format = workbook.add_format()
cell_format.set_text_wrap()
worksheet.set_column(2, 20, 20)
worksheet.set_column(1, 1, 55)
# description line
worksheet.write(1, 0, 'Nummer')
worksheet.write(1, 1, 'Satz')
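# up to five legal issues ('rechtliche Aussage'), each with a 'Kategorie' and a 'Dopplung' column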
for i in range(0, 10, 2):
worksheet.write(0, 2 + i, 'rechtliche Aussage:')
# split line
worksheet.write(1, 2 + i, 'Kategorie')
worksheet.write(1, 2 + i + 1, 'Dopplung')
# sentences with numbers
for i in range(len(sentences)):
worksheet.write(2 + i, 0, i + 1)
worksheet.write(2 + i, 1, sentences[i], cell_format)
# ending line
for i in range(0, 10, 2):
worksheet.write(2 + len(sentences), 2 + i,
'Falls der Inhalt der rechtlichen Aussage vollständig abgebildet wurde, welche Sätze '
'werden dazu benötigt?', cell_format)
worksheet.write(2 + len(sentences) + 1, 2 + i,
'Falls der Inhalt der rechtlichen Aussage insgesamt in einem passenden '
'Abstraktionsniveau angegeben wurden, '
'welche Sätze werden dazu benötigt?', cell_format)
def write_one_overview_worksheet(workbook, worksheet_name, sum_data):
"""
Writes one worksheet for the rouge overview files.
:param workbook: workbook to write in
:param worksheet_name: name of the sheet
:param sum_data: (gold, created, original_sents, improved) summary data
"""
worksheet = workbook.add_worksheet(name=worksheet_name)
gold, created, original_list, improved = sum_data
# original text
worksheet.write(0, 0, gold)
row = 2
current = ''
sentences = utils.split_into_sentences(created)
if improved:
sentences = combine_consecutive_sentences(sentences_to_combine=sentences, original_list=original_list)
for i in range(len(sentences)):
# sentence
current += ' ' + sentences[i]
current = current.strip()
worksheet.write(row, 0, sentences[i])
row += 1
def write_rouge_overview(ls_data, pm_data, aktenzeichen, improved):
"""
Writes the rouge overview files
:param ls_data: ls, ls_sums, eg_list with ls the leitsatz and ls_sums the created summaries
:param pm_data: pm, pm_sums, comb_list with pm the press release and pm_sums the created summaries
:param aktenzeichen: aktenzeichen of the case
:param improved: True if the improved version should be run
"""
if improved:
savepath = eval_path + eval_run_two_path + 'rouge_overview/'
else:
savepath = eval_path + 'rouge_overview/'
utils.create_dir(current_path=current_dir, directory_name=savepath, delete=False)
workbook = xlsxwriter.Workbook(
utils.server_path(current_path=current_dir,
path=savepath + aktenzeichen.replace('/', '-') + '.xlsx'))
ls, ls_sums, eg_list = ls_data
for interval, sum_r1, sum_rl in ls_sums:
write_one_overview_worksheet(workbook, 'ls rouge1 ' + interval, sum_data=(ls, sum_r1, eg_list, improved))
write_one_overview_worksheet(workbook, 'ls rougel ' + interval, sum_data=(ls, sum_rl, eg_list, improved))
pm, pm_sums, comb_list = pm_data
for interval, sum_r1, sum_rl in pm_sums:
write_one_overview_worksheet(workbook, 'pm rouge1 ' + interval, sum_data=(pm, sum_r1, comb_list, improved))
write_one_overview_worksheet(workbook, 'pm rougel ' + interval, sum_data=(pm, sum_rl, comb_list, improved))
workbook.close()
def read_or_load_summaries():
"""
Reads or loads the summaries from their files, for every interval, for rouge-1 and rouge-l and for pm and ls.
Loads the dataframes if they already exist, reads them from the excel files otherwise.
:return: (summaries, sentences) dataframes with summaries the summary data and sentences the corresponding
sentences of the summaries
"""
try:
summaries = utils.df_from_json(current_path=current_dir, path=data_path + all_summaries_df_name)
sentences = utils.df_from_json(current_path=current_dir, path=data_path + sum_sentences_df_name)
except Exception:
sentences = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str])
summaries = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, original_str, summary_id_str, summary_str,
rouge_p_str, rouge_r_str, rouge_f_str, rouge_type_str])
rouge_overview_path = eval_path + 'rouge_overview/'
for file in utils.list_dir_files(current_path=current_dir, path=rouge_overview_path):
current_summary_id = 0
file_sentences = pd.DataFrame(
columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str])
aktenzeichen = file.replace('.xlsx', '')
# go through the intervals
for i in range(1, 11, 1):
# go through pm and leitsatz
for identifier in ['pm', 'ls']:
# go through rouge-1 and rouge-l
for rouge_metric in [rouge1_str, rougel_str]:
sheetname = identifier + ' ' + rouge_metric + ' ' + \
str((i - 1) / 10) + '-' + str(i / 10)
df_sheet_data = pd.read_excel(rouge_overview_path + '/' + file,
sheet_name=sheetname)
if df_sheet_data.shape[0] == 0:
continue
my_summary = ''
my_sentences = [False] * file_sentences.shape[0]
original_summary = df_sheet_data.columns.values[0]
for index, row in df_sheet_data.iterrows():
if index == 0: # first row is empty
continue
sent = row[original_summary]
my_summary += ' ' + sent
my_summary = my_summary.strip()
# add sent to sentences or mark the old index
existing_sent = file_sentences.loc[(file_sentences[utils.aktenzeichen_str] ==
aktenzeichen) &
(file_sentences[sentence_str] == sent) &
(file_sentences[type_str] == identifier) &
(file_sentences[
rouge_type_str] == rouge_metric)]
if existing_sent.shape[0] > 0:
my_sentences[existing_sent.index.values[0]] = True
else:
file_sentences.loc[len(file_sentences.index)] = [identifier, aktenzeichen,
rouge_metric, sent] + \
[False] * (
file_sentences.shape[
1] - 4)
my_sentences.append(True)
file_sentences[current_summary_id] = my_sentences
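# transpose, drop duplicate columns (summaries that select exactly the same sentences) and transpose back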
file_sentences = file_sentences.T.drop_duplicates().T
if file_sentences.shape[1] <= (4 + current_summary_id): # duplicate summary
continue
if rouge_metric == rouge1_str:
r_p, r_r, r_f = rouge.rouge_n(reference_summary=original_summary,
created_summary=my_summary,
pp_options=pp_options, n=1, extended_results=True)
else:
r_p, r_r, r_f = rouge.rouge_l(reference_summary=original_summary,
created_summary=my_summary,
pp_options=pp_options, extended_results=True)
summaries.loc[len(summaries.index)] = [identifier, aktenzeichen, original_summary,
current_summary_id, my_summary, r_p, r_r,
r_f,
rouge_metric]
current_summary_id += 1
sentences = pd.concat([sentences, file_sentences], ignore_index=True)
sentences = sentences.fillna(False)
utils.create_dir(current_path=current_dir, directory_name=data_path, delete=False)
utils.df_to_json(current_path=current_dir, path=data_path + all_summaries_df_name, dataframe=summaries)
utils.df_to_json(current_path=current_dir, path=data_path + sum_sentences_df_name, dataframe=sentences)
return summaries, sentences
def read_or_load_summaries_run_two():
"""
Reads or loads the summaries from their files, for every interval, for rouge-1 and rouge-l and for pm and ls.
Loads the dataframes if they already exist, reads them from the excel files otherwise.
:return: (summaries, sentences) dataframes with summaries the summary data and sentences the corresponding
sentences of the summaries
"""
try:
summaries = utils.df_from_json(current_path=current_dir,
path=data_path + eval_run_two_path + all_summaries_df_name)
sentences = utils.df_from_json(current_path=current_dir,
path=data_path + eval_run_two_path + sum_sentences_df_name)
except Exception:
sentences = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str])
summaries = pd.DataFrame(columns=[type_str, utils.aktenzeichen_str, original_str, summary_id_str, summary_str,
rouge_p_str, rouge_r_str, rouge_f_str, rouge_type_str])
rouge_overview_path = eval_path + eval_run_two_path + 'rouge_overview/'
for file in utils.list_dir_files(current_path=current_dir, path=rouge_overview_path):
current_summary_id = 0
file_sentences = pd.DataFrame(
columns=[type_str, utils.aktenzeichen_str, rouge_type_str, sentence_str])
aktenzeichen = file.replace('.xlsx', '')
# go through the intervals
for i in range(1, 11, 1):
# go through pm and leitsatz
for identifier in ['pm', 'ls']:
# go through rouge-1 and rouge-l
for rouge_metric in [rouge1_str, rougel_str]:
sheetname = identifier + ' ' + rouge_metric + ' ' + \
str((i - 1) / 10) + '-' + str(i / 10)
df_sheet_data = pd.read_excel(rouge_overview_path + '/' + file,
sheet_name=sheetname)
if df_sheet_data.shape[0] == 0:
continue
my_summary = ''
my_sentences = [False] * file_sentences.shape[0]
original_summary = df_sheet_data.columns.values[0]
for index, row in df_sheet_data.iterrows():
if index == 0: # first row is empty
continue
sent = row[original_summary]
my_summary += ' ' + sent
my_summary = my_summary.strip()
# add sent to sentences or mark the old index
existing_sent = file_sentences.loc[(file_sentences[utils.aktenzeichen_str] ==
aktenzeichen) &
(file_sentences[sentence_str] == sent) &
(file_sentences[type_str] == identifier) &
(file_sentences[
rouge_type_str] == rouge_metric)]
if existing_sent.shape[0] > 0:
my_sentences[existing_sent.index.values[0]] = True
else:
file_sentences.loc[len(file_sentences.index)] = [identifier, aktenzeichen,
rouge_metric, sent] + \
[False] * (
file_sentences.shape[
1] - 4)
my_sentences.append(True)
file_sentences[current_summary_id] = my_sentences
file_sentences = file_sentences.T.drop_duplicates().T
if file_sentences.shape[1] <= (4 + current_summary_id): # duplicate summary
continue
if rouge_metric == rouge1_str:
r_p, r_r, r_f = rouge.rouge_n(reference_summary=original_summary,
created_summary=my_summary,
pp_options=pp_options, n=1, extended_results=True)
else:
r_p, r_r, r_f = rouge.rouge_l(reference_summary=original_summary,
created_summary=my_summary,
pp_options=pp_options, extended_results=True)
summaries.loc[len(summaries.index)] = [identifier, aktenzeichen, original_summary,
current_summary_id, my_summary, r_p, r_r,
r_f,
rouge_metric]
current_summary_id += 1
sentences = pd.concat([sentences, file_sentences], ignore_index=True)
sentences = sentences.fillna(False)
utils.create_dir(current_path=current_dir, directory_name=data_path + eval_run_two_path, delete=False)
utils.df_to_json(current_path=current_dir, path=data_path + eval_run_two_path + all_summaries_df_name,
dataframe=summaries)
utils.df_to_json(current_path=current_dir, path=data_path + eval_run_two_path + sum_sentences_df_name,
dataframe=sentences)
return summaries, sentences
def get_evaluated_sentences(run_two=False):
"""
Reads the evaluated sentences
:return: (sentences, info) two dataframes containing all sentences and additional info about sentences needed for a
complete representation of the original
"""
path = evaluated_path + summary_sentences_path
if run_two:
path = evaluated_path + eval_run_two_path + summary_sentences_path
result_sentences = pd.DataFrame()
result_info = pd.DataFrame()
for file in utils.list_dir_files(current_path=current_dir, path=path):
ls_data = pd.read_excel(utils.server_path(current_path=current_dir, path=path + file),
sheet_name='Leitsatz')
pm_data = pd.read_excel(utils.server_path(current_path=current_dir, path=path + file),
sheet_name='Pressemitteilung')
aktenzeichen = file.replace('.xlsx', '')
sentences, info = extract_one_type_data(ls_data, 'ls', aktenzeichen)
sentences_pm, info_pm = extract_one_type_data(pm_data, 'pm', aktenzeichen)
result_sentences = pd.concat([result_sentences, sentences, sentences_pm])
result_info = pd.concat([result_info, info, info_pm], ignore_index=True)
return result_sentences, result_info
def extract_one_type_data(dataframe, current_type, aktenzeichen):
"""
Extracts the data of one worksheet
:param dataframe: raw dataframe of the sheet
:param current_type: ls or pm
:param aktenzeichen: aktenzeichen of the judgement
:return: sentences, info with sentences the evaluated sentences and info a dataframe of legal issues and the
sentences needed to complete them
"""
# check if column empty
data_row_count = dataframe.shape[0] - 3
drop_columns = []
for i in range(len(dataframe.columns)):
column = dataframe.columns.values[i]
if (i % 2) == 0:
if dataframe[column][1:-2].isnull().sum() == data_row_count:
# everything empty
index = dataframe.columns.get_loc(column)
drop_columns.append(index)
if index + 1 < len(dataframe.columns):
drop_columns.append(index + 1)
dataframe.drop(dataframe.columns[drop_columns], axis=1, inplace=True)  # also drop the Dopplung column
# sentence duplicates
duplicates = []
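# every second column starting at index 3 holds the 'Dopplung' entries referencing a duplicate sentence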
duplicate_rows = dataframe.iloc[1:-2, 3::2].dropna(how='all')
for index, row in duplicate_rows.iterrows():
for dup_index in row.unique():
if not pd.isna(dup_index):
duplicates.append((index, dup_index))
# refill values for duplicates if not given
for a, b in duplicates:
a_v = dataframe.iloc[a].iloc[2::2].dropna().unique()
b_v = dataframe.iloc[b].iloc[2::2].dropna().unique()
if len(b_v) == 0: # missing value
dataframe.iloc[b, 2] = a_v[0]
if len(a_v) == 0: # missing value
dataframe.iloc[a, 2] = b_v[0]
# Sentence evaluations
sentences = dataframe.iloc[1:-2, :2]
sentences.columns = [sent_no_str, sentence_str]
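# every second column from index 2 onwards holds the category evaluation for one legal issue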
legal_issues = dataframe.iloc[1:-2, 2:].T.apply(lambda col: col.iloc[::2].dropna()).T
column_names = [issues_str + str(i) for i in range(1, legal_issues.shape[1] + 1)]
legal_issues.columns = column_names
sentences = pd.concat([sentences, legal_issues], axis=1)
# find the sentence combination needed for a complete representation
abstr_matching = []
content_matching = []
subset = dataframe.iloc[-2:, 2:]
for i in range(1, subset.shape[1], 2):
if not pd.isna(subset.iloc[0, i]):
content_matching.append(subset.iloc[0, i])
else:
content_matching.append('')
if not pd.isna(subset.iloc[1, i]):
abstr_matching.append(subset.iloc[1, i])
else:
abstr_matching.append('')
sentences[type_str] = current_type
sentences[utils.aktenzeichen_str] = aktenzeichen
abstr_matching = insert_duplicates(abstr_matching, duplicates)
content_matching = insert_duplicates(content_matching, duplicates)
info = pd.DataFrame(columns=[utils.aktenzeichen_str, type_str, issues_str, abstr_complete_str,
content_complete_str])
for i in range(len(abstr_matching)):
info.loc[len(info.index)] = [aktenzeichen, current_type, str(i), abstr_matching[i], content_matching[i]]
return sentences, info
def insert_duplicates(match, duplicates):
"""
In the list of sentences needed for a complete content or abstraction level, the duplicates are inserted
:param match: list of either abstraction or content matchings
:param duplicates: [(a,b)*] the list of duplicates
:return: the updated input list
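Example (assumed values): with duplicates [(2, 5)], the matching '1 2 3' becomes '1 3 2,5'.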
"""
for i in range(len(match)):
matching = str(match[i]).replace(',', '').split(' ')
for a, b in duplicates:
if str(a) in matching:
matching.remove(str(a))
matching.append(str(a) + ',' + str(b))
if str(b) in matching:
matching.remove(str(b))
matching.append(str(a) + ',' + str(b))
match[i] = ' '.join(matching)
return match
def get_interval_counts(summaries):
"""
Counts all summaries for the intervals
:param summaries: all summaries
:return: dataframe with the counted result
"""
res = pd.DataFrame()
for type_id in ['ls', 'pm']:
for rouge_type in [rouge1_str, rougel_str]:
selected_summaries = summaries[(summaries[type_str] == type_id) &
(summaries[rouge_type_str] == rouge_type)]
counts = selected_summaries[interval_start_str].value_counts()
counts.name = rouge_type + '_' + type_id
res = pd.concat([res, counts], axis=1)
res = res.sort_index()
return res
def get_interval_mean_ranking(summaries):
"""
Gets mean ranking of all summaries for the intervals
:param summaries: all summaries
:return: dataframe with the counted result
"""
res = pd.DataFrame()
for type_id in ['ls', 'pm']:
for rouge_type in [rouge1_str, rougel_str]:
selected_summaries = summaries[(summaries[type_str] == type_id) &
(summaries[rouge_type_str] == rouge_type)]
means = selected_summaries.groupby(interval_start_str)[evaluation_str].mean()
means.name = rouge_type + '_' + type_id
res = pd.concat([res, means], axis=1)
res = res.sort_index()
return res
def sentences_complete(number_list, sentence_index_list):
"""
Determines whether all sentences needed for completion are contained.
:param number_list: abstr_ or content_ list
:param sentence_index_list: all sentence indices of the summary
:return: true if all sentences are contained, false otherwise
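Example (assumed values): number_list='1 2,5 7' and sentence_index_list=['1', '5', '7'] yields True, since '2,5'
means sentence 2 or sentence 5.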
"""
if number_list == '':
return False
else:
numbers = number_list.split(' ')
for number in numbers:
if ',' in number: # duplicate numbering, 'or'
a, b = number.split(',')
if a not in sentence_index_list and b not in sentence_index_list:
return False
else:
if number not in sentence_index_list:
return False
return True
def get_cat_values(all_values):
"""
Reads all evaluated categories and writes them to a set
:param all_values: raw evaluation
:return: set of all letters contained.
"""
all_cats = set()
for cat in all_values:
if not pd.isna(cat):
for char in cat:
all_cats.add(char)
return all_cats
def get_one_summary_evaluation(package):
"""
Evaluates one summary.
:param package: my_info, my_sum_sents, sum_index
:return: sum_index, evaluation.
"""
my_info, my_sum_sents, sum_index = package
# for each legal issue
result = 0
num_legal_issues = my_info.shape[0]
for issue in range(num_legal_issues):
content_list = my_info[my_info[issues_str] == str(issue)][content_complete_str].iloc[0]
sentence_list = my_sum_sents[sent_no_str].unique()
sentence_list = [str(v) for v in sentence_list]
cat_v_content_comp = sentences_complete(content_list, sentence_list)
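# collect all category letters assigned to this legal issue over the sentences of the summary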
all_categories = get_cat_values(my_sum_sents[issues_str + str(issue + 1)].unique())
if 'F' in all_categories:
result = -4
elif 'L' in all_categories:
result -= 2
if 'V' in all_categories or cat_v_content_comp:
result += 2
elif 'P' in all_categories or 'E' in all_categories:
result += 1
if 'S' in all_categories: # as soon as there is one S, only one point
result += 1
elif 'E' in all_categories or 'G' in all_categories: # if no S, but G or E, then 2 points
result += 2
return sum_index, result / num_legal_issues / 4  # divide by 4 to normalise the score range
def evaluate_all_summaries(info, sents, sums):
"""
Coordinates calculation of all summary evaluations
:param info: infos to the summaries
:param sents: sentences of all summaries
:param sums: summary overviews of all summaries
:return: sums with an appended column containing the evaluation
"""
packaged_info = [(info[(info[utils.aktenzeichen_str] == row[utils.aktenzeichen_str]) &
(info[type_str] == row[type_str])],
sents[(sents[utils.aktenzeichen_str] == row[utils.aktenzeichen_str])
& (sents[str(row[summary_id_str])] == True)], index)
for index, row in sums.iterrows()]
res = utils.parallel_imap(get_one_summary_evaluation, packaged_args=packaged_info)
var = []
for content in res:
index, evaluation = content
var.append((index, evaluation))
idx, values = zip(*var)
evaluations = pd.Series(values, idx)
evaluations.name = evaluation_str
res_summaries = pd.concat([sums, evaluations], axis=1)
return res_summaries
def get_category_overview_data(sentences, name):
"""
Gets the data for plotting a bar plot of categories
:param sentences: sentences to include
:param name: name for the column
:return: dataframe containing prepared data
"""
sent_count = sentences.shape[0]
evals = sentences.apply(pd.Series.value_counts).sum(axis=1)
res = {}
for i in evals.index:
for char in i:
if char != ' ':
vals = res.get(char)
if vals is None:
vals = 0
vals += evals[i]
res[char] = vals
for key in res.keys():
res[key] = [res[key] / sent_count]
res_df = pd.DataFrame.from_dict(res).T.sort_index()
res_df.columns = [name]
return res_df
def get_category_counts(sentences, name, sums, evaluation_cols):
"""
Gets the data for plotting a bar plot of categories
:param sentences: sentences to include
:param name: name for the column
:param evaluation_cols: number of evaluation columns
:param sums: all summaries
:return: dataframe containing prepared data
"""
summary_count = sums.shape[0]
sentences_all = pd.DataFrame()
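# expand each sentence package into one row per contained sentence and per summary in which it occurs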
for _, row in sentences.iterrows():
for _ in range(len(elaborated_sentence_splitting(row[sentence_str]))):
occurrence_count = row[[str(x) for x in range(0, sums[summary_id_str].max() + 1)]].value_counts()[True]
for _ in range(occurrence_count):
sentences_all = pd.concat([sentences_all, row], axis=1)
sentences_all = sentences_all.T[[issues_str + str(i) for i in range(1, evaluation_cols + 1)]]
sent_count = sentences_all.shape[0]
evals = sentences_all.apply(pd.Series.value_counts).sum(axis=1)
res = {}
for i in evals.index:
for char in i:
if char != ' ':
vals = res.get(char)
if vals is None:
vals = 0
vals += evals[i]
res[char] = vals
for key in res.keys():
res[key] = [res[key] / sent_count]
res['avg number of sentences'] = [sent_count / summary_count]
res_df = pd.DataFrame.from_dict(res).T.sort_index()
res_df.columns = [name]
return res_df
def draw_pics(sents, sums, info, run_two=False):
"""
Draws the images for the visual evaluation.
:param sents: sentences to visualize
:param sums: summaries to visualize
:param info: corresponding info
:param run_two: if True, the files are written to the dedicated directory for the second run
"""
my_picture_path = picture_path
if run_two:
my_picture_path += eval_run_two_path
utils.create_dir(current_path=current_dir, directory_name=my_picture_path, delete=False)
# sent_subset = sents[~sents[[issues_str+str(i) for i in range(1, int(info[issues_str].max())+2)]]
# .isin(['R', 'T']).any(axis=1)]
# sum_subset = pd.DataFrame()
# for _, row in sums.iterrows():
# subset = sent_subset[(sent_subset[type_str] == row[type_str]) &
# (sent_subset[rouge_type_str] == row[rouge_type_str]) &
# (sent_subset[utils.aktenzeichen_str] == row[utils.aktenzeichen_str])
# & (sent_subset[str(row[summary_id_str])] == True)]
# if subset.shape[0] > 0:
# sum_subset = pd.concat([sum_subset, row], axis=1)
# sum_subset = sum_subset.T
# sents = sent_subset
# sums = sum_subset
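# 2x2 grid of box plots: rows = guiding principles / press releases, columns = ROUGE-1 / ROUGE-L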
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(11, 6))
for sum_type in ['ls', 'pm']:
for r_type in [rouge1_str, rougel_str]:
x_ticks = ['0.1 -\n0.2', '0.2 -\n0.3', '0.3 -\n0.4', '0.4 -\n0.5', '0.5 -\n0.6', '0.6 -\n0.7',
'0.7 -\n0.8', '0.8 -\n0.9', '0.9 -\n1.0']
if sum_type == 'ls':
if r_type == rouge1_str:
ax = ax1
x_label = 'ROUGE-1'
y_label = 'Guiding principles'
else:
ax = ax2
x_label = 'ROUGE-L'
y_label = ''
else:
if r_type == rouge1_str:
ax = ax3
x_label = ''
y_label = 'Press releases'
else:
ax = ax4
x_label = ''
y_label = ''
x_ticks = [tick for tick in x_ticks if float(tick[:3]) in sums[(sums[type_str] == sum_type) &
(sums[rouge_type_str] ==
r_type)][interval_start_str].unique()]
sums[(sums[type_str] == sum_type) & (sums[rouge_type_str] == r_type)][
['eval', interval_start_str]].plot(kind='box', ax=ax, by=interval_start_str,
color=dict(boxes='black', whiskers='black',
medians='black', caps='black'),
ylabel=y_label)
ax.set_title(x_label)
ax.set_ylim(0, 1)
ax.set_xticklabels(x_ticks)
fig.savefig(my_picture_path + 'boxplots.png')
xticklabels = ['ROUGE-1\ngp', 'ROUGE-L\ngp', 'ROUGE-1\npr',
'ROUGE-L\npr']
yticklabels = ['0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7', '0.7-0.8', '0.8-0.9', '0.9-1.0']
fig, (ax, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
mean_rankings = get_interval_mean_ranking(sums).fillna(0)
sns.heatmap(mean_rankings, annot=True, ax=ax, cmap='Greys', vmax=1, xticklabels=xticklabels,
yticklabels=yticklabels)
all_counts = get_interval_counts(sums).fillna(0)
sns.heatmap(all_counts, annot=True, ax=ax2, cmap='Greys_r', xticklabels=xticklabels, yticklabels=yticklabels)
ax2.tick_params(rotation=0)
ax.tick_params(rotation=0)
ax2.set_ylabel('Interval')
ax.set_ylabel('Interval')
fig.savefig(my_picture_path + 'heatmaps.png')
evaluations_cols = info[issues_str].astype(int).max() + 1
evals = sents[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
res_df = get_category_overview_data(evals, 'all')
for sum_type in ['ls', 'pm']:
for r_type in [rouge1_str, rougel_str]:
evals = sents[(sents[rouge_type_str] == r_type) & (sents[type_str] == sum_type)]
evals = evals[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
res_df = pd.concat([res_df, get_category_overview_data(evals, sum_type + ' ' + r_type)], axis=1)
fig = res_df.plot(kind='bar').get_figure()
fig.savefig(utils.server_path(current_path=current_dir, path=my_picture_path + 'cat_perc_types.png'))
evals = sents[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
res_df_intervals = get_category_overview_data(evals, 'all')
for interval in sums[interval_start_str].unique():
selection_list = sums[sums[interval_start_str] == interval][[utils.aktenzeichen_str, summary_id_str]].values
evals = pd.DataFrame()
for aktenzeichen, sum_id in selection_list:
evals = pd.concat([evals, sents[(sents[str(sum_id)] == True) &
(sents[utils.aktenzeichen_str] == aktenzeichen)]])
evals = evals[[issues_str + str(i) for i in range(1, evaluations_cols + 1)]]
res_df_intervals = pd.concat([res_df_intervals, get_category_overview_data(evals, interval)], axis=1)
fig = res_df_intervals.plot(kind='bar').get_figure()
fig.savefig(utils.server_path(current_path=current_dir, path=my_picture_path + 'cat_perc_intervals.png'))
# table overview
for sum_type in ['ls', 'pm']:
for r_type in [rouge1_str, rougel_str]:
print(sum_type + ' ' + r_type)
my_sents = sents[(sents[type_str] == sum_type) & (sents[rouge_type_str] == r_type)]
my_sums = sums[(sums[type_str] == sum_type) & (sums[rouge_type_str] == r_type)]
table_data = pd.DataFrame()
for interval in my_sums[interval_start_str].unique():
my_current_sums = my_sums[my_sums[interval_start_str] == interval]
selection_list = my_current_sums[[utils.aktenzeichen_str, summary_id_str]].values
evals = pd.DataFrame()
for aktenzeichen, sum_id in selection_list:
evals = pd.concat([evals, my_sents[(my_sents[str(sum_id)] == True) &
(my_sents[utils.aktenzeichen_str] == aktenzeichen)]])
table_data = pd.concat([table_data, get_category_counts(evals, interval, my_current_sums,
evaluations_cols).T], axis=0)
print_data = pd.DataFrame()
print_data['R+T'] = table_data['R'] + table_data['T']
print_data['U'] = table_data['U']
print_data['Rest'] = 1 - (print_data['U'] + print_data['R+T'])
print_data['avg sentences'] = table_data['avg number of sentences']
print(print_data.to_string())
def readjust_consecutive_sentences(sents, sums, info):
"""
Reads in the additional evaluations of consecutive sentence packages from the extra file
:param sents: all existing sentences
:param sums: all existing summaries
:param info: all existing infos
:return: the updated sentences including the new sentence packages
"""
res = pd.DataFrame()
comb_sents = pd.read_excel(evaluated_path + eval_run_two_path + 'AufeinanderfolgendeSätze.xlsx')
comb_sents.columns = [utils.aktenzeichen_str, type_str, sentence_str, evaluation_str,
evaluation_str + '_new']
new_sent_indices = {}
for _, comb_row in comb_sents.iterrows():
sentences = elaborated_sentence_splitting(comb_row[sentence_str])
if len(sentences) != len(comb_row[evaluation_str].split()):
print('Wrong splitting!')
elaborated_sentence_splitting(comb_row[sentence_str])
r1_sents = sents[(sents[type_str] == comb_row[type_str]) &
(sents[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
(sents[rouge_type_str] == rouge1_str) & sents[sentence_str].isin(sentences)]
rl_sents = sents[(sents[type_str] == comb_row[type_str]) &
(sents[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
(sents[rouge_type_str] == rougel_str) & sents[sentence_str].isin(sentences)]
r1_sums = []
rl_sums = []
# find summaries containing the new sentence package
for col_name in [str(i) for i in range(1, sums[summary_id_str].max() + 1)]:
var_r1 = r1_sents[col_name].unique()
if len(var_r1) == 1 and var_r1[0] == True:
r1_sums.append(col_name)
var_rl = rl_sents[col_name].unique()
if len(var_rl) == 1 and var_rl[0] == True:
rl_sums.append(col_name)
if len(r1_sums) > 0:
new_index, new_sent_indices = get_new_sent_index(new_sent_indices, comb_row[utils.aktenzeichen_str], sents)
res = pd.concat([res, get_new_row(comb_row=comb_row, my_sents=r1_sents, r_string=rouge1_str,
sums=sums, sum_list=r1_sums, info=info, sent_no=new_index)])
if len(rl_sums) > 0:
new_index, new_sent_indices = get_new_sent_index(new_sent_indices, comb_row[utils.aktenzeichen_str], sents)
res = pd.concat([res, get_new_row(comb_row=comb_row, my_sents=rl_sents, r_string=rougel_str,
sums=sums, sum_list=rl_sums, info=info, sent_no=new_index)])
for index in r1_sents.index.values:
for sum_id in r1_sums:
sents.at[index, sum_id] = False
for index in rl_sents.index.values:
for sum_id in rl_sums:
sents.at[index, sum_id] = False
return pd.concat([sents, res])
def get_new_sent_index(index_dict, aktenzeichen, sentences):
"""
Gets a new number for a sentence to add that wasn't used for that aktenzeichen before
:param index_dict: dict with existing new indices
:param aktenzeichen: aktenzeichen to look for
:param sentences: sentences to find existing numbers in
:return: a new sentence number to use
"""
new_index = index_dict.get(aktenzeichen)
if new_index is None:
new_index = sentences[(sentences[utils.aktenzeichen_str] ==
aktenzeichen.replace('/', '-'))][sent_no_str].max() + 1
index_dict[aktenzeichen] = new_index + 1
return new_index, index_dict
def get_new_row(comb_row, my_sents, r_string, sums, sum_list, info, sent_no):
"""
Creates a new sentence (package) row
:param comb_row: information concerning the sentence
:param my_sents: sentences appearing in that sentence package
:param r_string: rouge_string
:param sums: all existing summaries
:param sum_list: index list of summaries for that sentence
:param info: all infos
:param sent_no: sentence number for the row
:return: a row as dataframe
"""
row_data = {type_str: [comb_row[type_str]], rouge_type_str: [r_string],
utils.aktenzeichen_str: [my_sents.iloc[0][utils.aktenzeichen_str]],
sentence_str: [comb_row[sentence_str]], sent_no_str: [sent_no]}
for col_name in [str(i) for i in range(sums[summary_id_str].max())]:
if col_name not in sum_list:
row_data[col_name] = [False]
else:
row_data[col_name] = [True]
# legal issue
legal_issues = my_sents[[issues_str + str(i + 1) for i in range(int(info[issues_str].max()) + 1)]].dropna(axis=1,
how='all')
for col in legal_issues.columns:
row_data[col] = comb_row[evaluation_str + '_new']
if 'P' in str(comb_row[evaluation_str + '_new']):
# completeness
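# rebuild the completeness list: old sentence numbers that are now part of the new package are replaced by an
# '(old-numbers,new-number)' alternative, all other numbers are kept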
old_sentences = info[(info[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
(info[type_str] == comb_row[type_str]) &
(info[issues_str] == str(int(col[-1]) - 1))][content_complete_str].values[0].split(' ')
new_sentences = ''
or_string = ''
for sentence_no in old_sentences:
if sentence_no in [str(x) for x in my_sents[sent_no_str].values]:
or_string += ' ' + sentence_no
or_string = or_string.strip()
else:
new_sentences += ' ' + sentence_no
new_sentences = new_sentences.strip()
if or_string != '':
or_string = '(' + or_string.replace(' ', '-') + ',' + str(sent_no) + ')'
new_sentences += ' ' + or_string
else:
new_sentences += ' ' + str(sent_no)
new_sentences = new_sentences.strip()
info.at[info[(info[utils.aktenzeichen_str] == comb_row[utils.aktenzeichen_str].replace('/', '-')) &
(info[type_str] == comb_row[type_str]) &
(info[issues_str] == str(int(col[-1]) - 1))]
[content_complete_str].index.values[0], content_complete_str] = new_sentences
return pd.DataFrame.from_dict(row_data)
def remove_bad_cats(sums, sents, infos):
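"""
Removes sentences whose legal-issue evaluations only contain the categories R or T (or are empty) and rebuilds the
affected summaries, recomputing their ROUGE scores from the remaining sentences.
:param sums: all summaries
:param sents: all evaluated sentences
:param infos: info dataframe, used to determine the number of legal-issue columns
:return: res_sents, res_sums with the filtered sentences and the recomputed summaries
"""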
res_sums = pd.DataFrame()
res_sents = sents[~sents[[issues_str + str(i + 1) for i in range(int(infos[issues_str].max()) + 1)]].isin(
['R', 'T', np.nan]).all(axis=1)]
for _, row in sums.iterrows():
my_sents = res_sents[(res_sents[utils.aktenzeichen_str] == row[utils.aktenzeichen_str]) &
(res_sents[type_str] == row[type_str]) &
(res_sents[rouge_type_str] == row[rouge_type_str]) &
(res_sents[str(row[summary_id_str])] == True)]
if my_sents.shape[0] > 0:
summary = ' '.join(my_sents[sentence_str].values)
new_row = {type_str: [row[type_str]], utils.aktenzeichen_str: [row[utils.aktenzeichen_str]],
original_str: [row[original_str]], summary_id_str: [row[summary_id_str]],
rouge_type_str: [row[rouge_type_str]], summary_str: [summary]}
if row[rouge_type_str] == rouge1_str:
p, r, f = rouge.rouge_n(reference_summary=row[original_str], created_summary=summary, n=1,
pp_options=pp_options, extended_results=True)
else:
p, r, f = rouge.rouge_l(reference_summary=row[original_str], created_summary=summary,
pp_options=pp_options, extended_results=True)
new_row['rouge_precision'] = p
new_row['rouge_recall'] = r
new_row['rouge_f_measure'] = f
res_sums = pd.concat([res_sums, pd.DataFrame.from_dict(new_row)], ignore_index=True)
return res_sents, res_sums
def evaluate_summaries_run_two():
"""
Coordinates evaluation of the summaries from the sentences evaluated
"""
all_summaries, all_summary_sentences = read_or_load_summaries_run_two()
evaluated_sentences, evaluated_infos = get_evaluated_sentences(run_two=True)
final_summaries = all_summaries[all_summaries[utils.aktenzeichen_str].isin(evaluated_infos[utils.aktenzeichen_str])]
final_sentences = all_summary_sentences.merge(evaluated_sentences, how='inner',
on=[utils.aktenzeichen_str, type_str, sentence_str])
# final_sentences, final_summaries = remove_bad_cats(sents=final_sentences,
# sums=final_summaries, infos=evaluated_infos)
final_summaries = evaluate_all_summaries(evaluated_infos, final_sentences, final_summaries)
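# assign each summary to its interval: ROUGE-1 summaries by recall, ROUGE-L summaries by F-measure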
final_summaries[interval_start_str] = final_summaries.apply(
lambda row: (np.ceil(row['rouge_recall'] * 10) - 1) / 10
if row[rouge_type_str] == rouge1_str
else (np.ceil(row['rouge_f_measure'] * 10) - 1) / 10, axis=1)
final_summaries = final_summaries.drop_duplicates([utils.aktenzeichen_str, type_str, rouge_type_str,
interval_start_str])
final_summaries = final_summaries[final_summaries[interval_start_str] > 0.0]
draw_pics(sents=final_sentences, sums=final_summaries, info=evaluated_infos, run_two=True)
def evaluate_summaries():
"""
Coordinates evaluation of the summaries from the sentences evaluated
"""
all_summaries, all_summary_sentences = read_or_load_summaries()
evaluated_sentences, evaluated_infos = get_evaluated_sentences()
final_sentences = all_summary_sentences.merge(evaluated_sentences, how='inner',
on=[utils.aktenzeichen_str, type_str, sentence_str])
final_summaries = all_summaries[all_summaries[utils.aktenzeichen_str].isin(evaluated_infos[utils.aktenzeichen_str])]
# final_sentences, final_summaries = remove_bad_cats(sents=final_sentences, sums=final_summaries,
# infos=evaluated_infos)
final_summaries = evaluate_all_summaries(evaluated_infos, final_sentences, final_summaries)
final_summaries[interval_start_str] = final_summaries.apply(
lambda row: (np.ceil(row['rouge_recall'] * 10) - 1) / 10
if row[rouge_type_str] == rouge1_str
else (np.ceil(row['rouge_f_measure'] * 10) - 1) / 10, axis=1)
final_summaries = final_summaries.drop_duplicates([utils.aktenzeichen_str, type_str, rouge_type_str,
interval_start_str])
final_summaries = final_summaries[final_summaries[interval_start_str] > 0.0]
draw_pics(sents=final_sentences, sums=final_summaries, info=evaluated_infos)
def combine_modus(list_of_string):
"""
For the input list of consecutive strings, konjunktiv sentences are combined with all following sentences until
there is an indicative sentence
:param list_of_string: strings for combining
:return: updated list
"""
result_strings = []
current_string = ''
for string in list_of_string:
nlp_string = settings.nlp(string)
ind = False
sub = False
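# STTS tags for finite and imperative verb forms (full, auxiliary and modal verbs)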
verb_mod_tag_list = ['VVFIN', 'VAFIN', 'VMFIN', 'VVIMP', 'VAIMP']
for token in nlp_string:
if token.tag_ in verb_mod_tag_list:
mood = token.morph.get('Mood')
if 'Ind' in mood:
if token.head.tag_ in verb_mod_tag_list:
head_mood = token.head.morph.get('Mood')
if 'Sub' not in head_mood:
ind = True
else:
ind = True
if 'Sub' in mood:
sub = True
current_string += ' ' + string
current_string = current_string.strip()
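# flush the accumulated string once an indicative verb was found or the sentence contains no Konjunktiv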
if ind or not sub:
result_strings.append(current_string)
current_string = ''
return result_strings
def get_index_in_list(sentence, string_list):
"""
Returns the index of a string in a list with some leniency.
:param sentence: sentence to find
:param string_list: list to look in
:return: index or None, if nothing was found
"""
try:
return string_list.index(sentence)
except ValueError:
short_length = 200
# adapt the compared prefix length: shorten it if no candidate matches, lengthen it if several do
for i in range(100):
result_list = [string_list.index(l_item) for l_item in string_list
if sentence[:min(short_length, len(sentence))] in l_item]
if len(result_list) == 1:
return result_list[0]
if len(result_list) == 0:
short_length -= 10
if len(result_list) > 1:
short_length += 50
return None
def readjust_splitting(old_list):
"""
Some sentences might not be split correctly
:param old_list: original split sentences
:return: new split sentences
"""
res = []
for string in old_list:
if 'InfoV. ' in string:
split_list = string.split('InfoV. ')
for i in range(len(split_list) - 1):
split_list[i] = split_list[i] + 'InfoV.'
res += split_list
else:
res.append(string)
return res
if __name__ == "__main__":
# pm_judgments = get_evaluation_data(case_list=cases_one_list)
# data = [(a, b, False) for (a, b) in pm_judgments]
# utils.parallel_imap(write_files_for_one_judgement, data)
# evaluate_summaries()
# pm_judgments = get_evaluation_data(case_list=cases_two_list)
# print_following_sentences()
# data = [(a, b, c, True) for (a, b, c) in pm_judgments]
evaluate_summaries_run_two()
print('Done')