Browse Source

Added Code and data

master
Bianca Steffes 1 year ago
commit
13d30ecd6f
  1. 7
      .gitignore
  2. 214
      data/download_rii.py
  3. 47
      pm_summary/annotation_evaluation.py
  4. 148
      rouge.py
  5. 1439
      rouge_evalauation/create_evaluation_files.py
  6. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx
  7. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx
  8. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx
  9. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx
  10. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx
  11. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx
  12. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx
  13. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx
  14. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx
  15. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx
  16. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx
  17. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx
  18. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx
  19. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx
  20. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx
  21. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx
  22. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx
  23. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx
  24. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx
  25. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx
  26. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx
  27. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx
  28. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx
  29. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx
  30. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx
  31. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx
  32. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx
  33. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx
  34. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx
  35. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx
  36. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx
  37. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx
  38. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx
  39. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx
  40. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx
  41. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx
  42. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx
  43. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx
  44. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx
  45. BIN
      rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx
  46. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx
  47. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx
  48. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx
  49. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 84-20.xlsx
  50. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx
  51. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx
  52. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx
  53. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx
  54. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx
  55. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx
  56. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx
  57. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx
  58. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx
  59. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx
  60. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx
  61. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx
  62. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx
  63. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx
  64. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx
  65. BIN
      rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx
  66. BIN
      rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx
  67. BIN
      rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx
  68. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx
  69. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx
  70. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx
  71. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx
  72. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx
  73. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx
  74. BIN
      rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx
  75. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx
  76. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx
  77. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx
  78. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx
  79. BIN
      rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx
  80. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx
  81. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx
  82. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx
  83. BIN
      rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx
  84. BIN
      rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx
  85. BIN
      rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx
  86. 12
      settings.py
  87. 192
      test/test_rouge.py
  88. 475
      utils.py

7
.gitignore

@ -0,0 +1,7 @@
.idea
__pycache__
data/dataframes
rouge_evalauation/dataframes
rouge_evalauation/figures
rouge_evalauation/evaluated_data/second_eval/AufeinanderfolgendeSätze.xlsx
rouge_evalauation/manual_evaluation

214
data/download_rii.py

@ -0,0 +1,214 @@
import time
import xml.etree.ElementTree as ET
import urllib.request as request
import zipfile
import os
import pandas as pd
import settings
import utils
from utils import time_convert
# Directory layout for the raw court decisions downloaded from rechtsprechung-im-internet.de
base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'  # one subdirectory per senate
dataframe_dir_bgh = 'dataframes/bgh/'         # destination for the combined pickle
pickle_name_bgh = 'bgh_data.pkl'
# XML tags read directly from each case file (see read_file_data)
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
# Attributes nested one level deep; the underscore encodes the tag path (e.g. region/abk)
nested_attributes = ["region_abk", "region_long"]
# Subset of attributes holding free text (candidates for text preprocessing)
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'  # filename suffix for stopword-free variants
current_path = 'data'                 # module location, passed to utils path helpers
def get_file_list():
    """
    Downloads the table-of-contents XML listing all currently available cases.

    :return: the web page with all current cases as an xml-tree (root element)
    """
    toc_url = 'https://www.rechtsprechung-im-internet.de/rii-toc.xml'
    local_file, _ = request.urlretrieve(toc_url)
    return ET.parse(local_file).getroot()
def count_cases(root, tag):
    """
    Counts all cases belonging to the given tag.

    :param root: downloaded xml-tree with all files
    :param tag: tag to find in the name (first child of each entry)
    :return: number of cases whose name contains the tag
    """
    return sum(1 for child in root if tag in child[0].text)
def download(base_dir, extended_dir, tag):
    """
    Downloads all cases of the given court into one folder per senate.

    :param base_dir: Name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download
    root = get_file_list()  # child[0] ist gericht, child[3] ist link
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        if tag not in child[0].text:
            continue
        # BUGFIX: the original wrapped this in "while True: try: ... finally: break".
        # A `break` in a finally clause silently discards any in-flight exception,
        # so every failed download vanished without a trace (the construct is also
        # deprecated in newer Python versions). Failures are now reported while
        # still continuing with the next case.
        try:
            filename, http = request.urlretrieve(child[3].text)
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(
                    utils.server_path(current_path=current_path,
                                      path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
            os.remove(filename)
            downloaded += 1
            print("\rDownloaded %d of %d " % (downloaded, max_cases) + tag + "Cases", end="")
        except Exception as e:
            print("\nSkipping %s: %s" % (child[0].text, e))
    print("\nDone!")
def _element_text(element):
    """
    Joins all text fragments of an xml element, removing the space that would
    otherwise precede a sentence mark.

    :param element: xml element (or None if the tag was absent)
    :return: the cleaned text, or None if the element is missing or holds no text
    """
    if element is None:
        # robustness: the original crashed with AttributeError on a missing tag
        return None
    text = ''
    for t in element.itertext():
        if t == '.' or t == ',' or t == ';' or t == '!' or t == '?':
            text = text.strip()  # remove space before these characters
        text += t + ' '
    text = text.strip()
    return None if text == '' else text


def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: package containing (filename, directory, directory extension) to address the file
    :return: a one-row pandas DataFrame with key: attribute_name and value: attribute_value
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        # NOTE (from original author, translated): check leitsatz -- two words may be
        # joined while the whitespace is still present!
        res[attribute] = _element_text(root.find(attribute))
    for attribute in nested_attributes:
        # the underscore in the attribute name encodes the nesting path, e.g. region_abk -> region/abk
        xml_tag = root
        for part in attribute.split('_'):
            xml_tag = xml_tag.find(part)
        res[attribute] = _element_text(xml_tag)
    # post-process the free-text columns according to the global settings
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])
def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended dir to find the files
    :param pickle_name: name of the pickle to save
    :param steps: how many cases should be worked on now
    """
    # never delete: the pickle may already contain results from earlier runs
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    # every .xml file in every senate subdirectory, tagged with its location
    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]
    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)
    pickle_path = dataframe_dir_bgh+extension+pickle_name
    # resume support: drops already-processed files and loads previous results
    # (presumably -- confirm against utils.get_step_subset_raw)
    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)
    # parse the case files in parallel; each worker returns a one-row DataFrame
    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)
    print('Resulting dataframes have length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)
def get_selected_bgh_data(directory='.\\'):
"""
Shortcut for getting the BGH data currently needed. Selects all data from the Civil copurts which contain 'Urteile'
:param directory: directory offset from current position, with ending slashes
:return: the data
"""
return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')
def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for access to the bgh pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with ending slash)
    :param spruchkoerper: Parameter can be used to select the senates (checks whether the given string is contained
    in the datas spruchkoerper)
    :param doktyp: can be used to select specific documents (like 'Urteil', 'Beschluss', etc.), must contain the word
    :return: The data as a pandas dataframe
    """
    prefix = settings.no_brackets_suffix if settings.remove_brackets else ''
    data = utils.df_from_pickle(current_path=current_path,
                                path=directory + dataframe_dir_bgh + prefix + pickle_name)
    if spruchkoerper is not None:
        # rows without a senate cannot match, drop them before the substring test
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    # drop all columns with no value, then duplicate rows
    data = data.dropna(axis=1, how='all')
    return data.drop_duplicates()
# if __name__ == "__main__":
# download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
# create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)

47
pm_summary/annotation_evaluation.py

@ -0,0 +1,47 @@
import os
import pandas as pd
import utils
# Column keys used for the annotated press-release spreadsheets.
# The German strings are the literal column headers / questions in the
# annotation files and must not be changed.
pm_sent_no = 'pm_sent_number'
pm_sent = 'Sätze der Pressemitteilung'            # sentences of the press release
judgement_sent_no = 'judgement_sent_number'
judgement_sent = 'Dazu passende Sätze des Urteils'  # matching sentences of the judgment
keywords = 'Schlagworte'                          # keywords
comments = 'Anmerkung'                            # remarks
duration = 'Wie lange hast Du für die Bearbeitung dieses Urteils gebraucht?'  # time needed
bad_pm = 'Ist diese Pressemitteilung eine schlechte Darstellung / Zusammenfassung des Urteils?'  # bad summary?
current_dir = 'pm_summary/'  # module location, passed to utils path helpers
def prepare_file(path):
    """
    Reads one annotated file and converts it into a uniform format.

    :param path: path to the file
    :return: dictionary holding the results; every sentence of the press release
        gets its own sub-dict, keyed by its sentence number
    """
    raw_data = pd.read_excel(path, names=[pm_sent_no, pm_sent], header=None)
    sentences = {}
    for _, row in raw_data.iterrows():
        entry = {pm_sent_no: row[pm_sent_no], pm_sent: row[pm_sent]}
        sentences[entry[pm_sent_no]] = entry
    return sentences
def get_all_pm_files():
    """
    Returns the list of all annotated pm-files.

    :return: [(pm_filename, file_data)*]
    """
    file_path_base = utils.server_path(current_path=current_dir,
                                       path='../rouge_evalauation/evaluated_data/extractive_judgments')
    return [(judgment, prepare_file(file_path_base + '/' + judgment))
            for judgment in os.listdir(file_path_base)
            if '.xlsx' in judgment]

148
rouge.py

@ -0,0 +1,148 @@
import utils
# Weight of recall relative to precision in the ROUGE-L f-measure;
# beta = 1 yields the balanced F1 score.
beta = 1
# Reference implementation follows Lin (2004): https://aclanthology.org/W04-1013/
def rouge_n(reference_summary, created_summary, n, pp_options=None, extended_results=False):
    """
    Calculates the rouge n score.

    :param reference_summary: gold standard summary
    :param created_summary: summary to evaluate
    :param n: size of n-grams
    :param pp_options: list of options for preprocessing, if None then no preprocessing will be done
    :param extended_results: indicates whether precision, recall and f-measure should be returned
    :return: the recall score, or (precision, recall, f-measure) if extended results are wanted
    """
    # preprocess
    if pp_options is not None:  # otherwise don't preprocess. Text is already preprocessed
        reference_summary = utils.preprocess_text(reference_summary, pp_options)
        created_summary = utils.preprocess_text(created_summary, pp_options)
    else:  # separate sentence marks from tokens
        for sentence_mark in utils.sentence_marks:
            reference_summary = reference_summary.replace(sentence_mark, ' '+sentence_mark)
            created_summary = created_summary.replace(sentence_mark, ' ' + sentence_mark)
    # split into n-grams of size n and count occurrences of single ngrams
    reference_ngrams, ref_complete_count = count_n_grams(reference_summary, n)
    created_ngrams, created_complete_count = count_n_grams(created_summary, n)
    overlapping_count = 0
    for ref_key, ref_count in reference_ngrams.items():
        created_count = created_ngrams.get(ref_key)
        if created_count is not None:  # ngram appears in both summaries
            overlapping_count += min(ref_count, created_count)
    # calculate score.
    # BUGFIX: the original returned a bare 0 here even when a triple was requested,
    # giving callers an inconsistent return type.
    if ref_complete_count == 0:
        return (0, 0, 0) if extended_results else 0
    recall = overlapping_count / ref_complete_count
    if extended_results:
        # BUGFIX: guard both divisions -- an empty created summary or a zero
        # overlap previously raised ZeroDivisionError.
        precision = overlapping_count / created_complete_count if created_complete_count else 0
        f_measure = (2 * precision * recall) / (precision + recall) if precision + recall else 0
        return precision, recall, f_measure
    return recall
def count_n_grams(pp_summary, n):
    """
    Counts the n-grams of the given size in a summary.

    :param pp_summary: Pre-processed summary
    :param n: n for the size of ngrams
    :return: ({ngram: count}, total number of ngrams) for all ngrams in the summary
    """
    tokens = pp_summary.split(' ')
    counts = {}
    total = 0
    # every window of n consecutive tokens forms one ngram
    for start in range(len(tokens) - n + 1):
        gram = ' '.join(tokens[start:start + n])
        if gram == '':
            continue  # skip the empty ngram produced by an empty input
        total += 1
        counts[gram] = counts.get(gram, 0) + 1
    return counts, total
def rouge_l(reference_summary, created_summary, pp_options=None, extended_results=False):
    """
    Calculates the rouge-l value of a summary and its gold standard summary.

    :param reference_summary: Gold standard summary
    :param created_summary: Created summary to compare
    :param pp_options: options for preprocessing, if None then there will be no preprocessing
    :param extended_results: if True, precision, recall and f-score will be returned
    :return: the f-score, or (precision, recall, f-measure) if extended results are wanted
    """
    # preprocess
    if pp_options is not None:  # otherwise don't preprocess. Text is already preprocessed
        reference_summary = utils.preprocess_text(reference_summary, pp_options)
        created_summary = utils.preprocess_text(created_summary, pp_options)
    # word counts (m and n in the ROUGE paper) are taken before sentence splitting
    m_reference_word_number = len(reference_summary.split(' '))
    reference_summary = utils.split_into_sentences(reference_summary)
    n_created_word_number = len(created_summary.split(' '))
    created_summary = utils.split_into_sentences(created_summary)
    # one index set per sentence, to make sure every word is counted only once.
    # BUGFIX: the original built these as [set()] * len(...), repeating one shared
    # set object; that only worked because the sets were never mutated in place.
    used_created_indices = [set() for _ in created_summary]
    used_gold_indices = [set() for _ in reference_summary]
    for j, ref_sentence in enumerate(reference_summary):
        # union of the longest subsequences against every created sentence
        for i, created_sentence in enumerate(created_summary):
            indices_a, indices_b = get_subsequence(ref_sentence, created_sentence)
            used_gold_indices[j] = used_gold_indices[j].union(indices_a)
            used_created_indices[i] = used_created_indices[i].union(indices_b)
    # the min ensures words aren't counted twice on either side
    total_sum_subsequences = min(sum(len(s) for s in used_gold_indices),
                                 sum(len(s) for s in used_created_indices))
    if total_sum_subsequences == 0:
        # BUGFIX: return a triple here as well when extended results are requested,
        # instead of a bare scalar 0
        return (0, 0, 0) if extended_results else 0
    p_lcs = total_sum_subsequences / n_created_word_number
    r_lcs = total_sum_subsequences / m_reference_word_number
    f_lcs = ((1 + beta * beta) * r_lcs * p_lcs) / (r_lcs + beta * beta * p_lcs)
    if extended_results:
        return p_lcs, r_lcs, f_lcs
    return f_lcs
def get_subsequence(sent_a, sent_b):
    """
    Finds all (not necessarily consecutive) subsequences of a in b.

    For every possible start position in sent_a, greedily matches the remaining
    words of sent_a against sent_b left-to-right and keeps the longest match.

    :param sent_a: Sentence to find subsequences from
    :param sent_b: Sentence to find subsequence in
    :return: (ind_a, ind_b) two sets of indices of sent_a and sent_b of the longest subsequence
    """
    result_a = set()
    words_a = sent_a.split(' ')
    words_b = sent_b.split(' ')
    for word_index_a in range(len(words_a)):
        word_result = set()
        char_index_b = 0
        while word_index_a < len(words_a):
            # word is contained
            try:
                found_index = words_b.index(words_a[word_index_a], char_index_b)
                word_result.add(word_index_a)
                # NOTE(review): the next search starts AT found_index, not after it,
                # so a repeated word in sent_b can be matched again -- possibly
                # should be found_index + 1; confirm against the duplicate-word
                # tests before changing.
                char_index_b = found_index
                word_index_a += 1
            except ValueError:
                # word not contained in b, do nothing and advance in a
                word_index_a += 1
        if len(word_result) > len(result_a):
            result_a = word_result
    # NOTE(review): index() returns the FIRST occurrence, so duplicate words in
    # sent_a collapse to one b-index and result_b can be smaller than result_a.
    # rouge_l's min() over both sides appears to rely on this undercount.
    result_b = set([words_b.index(words_a[a_ind]) for a_ind in result_a])
    return result_a, result_b
# Smoke-test entry point; the module is normally imported, not executed directly.
if __name__ == "__main__":
    print('Done')

1439
rouge_evalauation/create_evaluation_files.py
File diff suppressed because it is too large
View File

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_113-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_25-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_292-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_35-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_391-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_42-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_55-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_67-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/III_ZR_79-21.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_152-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/II_ZR_84-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_144-21.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/IV_ZR_253-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_139-15.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_146-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_153-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/I_ZR_23-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_21-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_277-16.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_66-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIII_ZR_94-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_151-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_192-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VII_ZR_78-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_128-20.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_252-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VI_ZR_506-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/VIa_ZR_418-21.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_112-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_176-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_218-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_254-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_273-17.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_299-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/V_ZR_8-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/XII_ZR_13-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_345-18.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/XI_ZR_7-19.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_107-16.xlsx

BIN
rouge_evalauation/evaluated_data/extractive_judgments/X_ZR_96-17.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 146-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/I ZR 153-17.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 152-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/II ZR 84-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 25-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/III ZR 79-21.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 144-21.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/IV ZR 253-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 299-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/V ZR 8-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 128-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VI ZR 252-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 192-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VII ZR 78-20.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 21-19.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIII ZR 66-17.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/VIa ZR 418-21.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/X ZR 107-16.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 345-18.xlsx

BIN
rouge_evalauation/evaluated_data/second_eval/summary_sentences/XI ZR 7-19.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/I ZR 139-15.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/I ZR 23-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 113-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 292-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 35-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 391-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 42-19.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 55-19.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/III ZR 67-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 112-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 176-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 218-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 254-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/V ZR 273-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VI ZR 506-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VII ZR 151-18.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 277-16.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/VIII ZR 94-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/X ZR 96-17.xlsx

BIN
rouge_evalauation/evaluated_data/summary_sentences/XII ZR 13-19.xlsx

12
settings.py

@ -0,0 +1,12 @@
# creating BGH data
import spacy
# If True, bracketed text is stripped from the case texts when the dataframes
# are built (see data/download_rii.py), and pickles get the prefix below.
remove_brackets = False
# Presumably toggles server-specific file paths in utils -- confirm against
# utils.server_path before relying on it.
server = False
no_brackets_suffix = "no_br_"  # filename prefix for pickles built without brackets
nlp = spacy.load("de_core_news_sm")  # small one
# nlp = spacy.load("de_dep_news_trf") # big one, CUDA Problems on Server...

192
test/test_rouge.py

@ -0,0 +1,192 @@
from unittest import TestCase
import rouge
import utils
def get_file_data(filename):
    """
    Reads a whole text file as UTF-8.

    :param filename: path to the file
    :return: the file content as one string
    """
    # BUGFIX: use a context manager so the handle is closed even if read() raises
    # (the original open/read/close leaked the handle on error).
    with open(filename, "r", encoding='utf-8') as text_file:
        return text_file.read()
def run_tests_with_data(self, test_data):
    """
    For every entry, asserts that the rouge-1 score of the combined summary is
    at least as high as the score of either part alone.

    :param self: the TestCase instance supplying the assert methods
    :param test_data: list of [original, generated_one, generated_two, description]
    """
    for reference, first, second, description in test_data:
        print(description)
        print('original: ' + reference)
        score_first = rouge.rouge_n(reference, first, n=1)
        score_second = rouge.rouge_n(reference, second, n=1)
        print('Rouge for one: ' + str(score_first) + ' ' + first)
        print('Rouge for two: ' + str(score_second) + ' ' + second)
        both = second + ' ' + first
        score_both = rouge.rouge_n(reference, both, n=1)
        print('Rouge for combined: ' + str(score_both) + ' ' + both)
        self.assertGreaterEqual(score_both, score_first)
        self.assertGreaterEqual(score_both, score_second)
class RougeTest(TestCase):
def tests_from_paper(self):
    """Re-checks the ROUGE-L worked examples from Lin (2004) plus a two-sentence case."""
    # Example from the paper: s2 shares the in-order words "police the gunman" (3 of 4),
    # s3 only "the gunman" / "police" split across order (2 of 4).
    s1 = 'police killed the gunman'
    s2 = 'police kill the gunman'
    s3 = 'the gunman kill police'
    score_s2 = rouge.rouge_l(s1, s2)
    self.assertEqual(score_s2, 0.75)
    score_s3 = rouge.rouge_l(s1, s3)
    self.assertEqual(score_s3, 0.5)
    # Multi-sentence case: 4 reference words are covered across the two created
    # sentences; the created summary has 12 tokens, the reference 5.
    reference = 'affe birne club düne essen'
    summary = 'affe birne feder geld himmel. affe club insel jagd essen.'
    lcs = 4
    p = lcs / 12
    r = lcs / 5
    f = 2*(r*p)/(r+p)
    r_p, r_r, r_f = rouge.rouge_l(reference_summary=reference, created_summary=summary,
                                  pp_options=[utils.pp_option_stopwords],
                                  extended_results=True)
    self.assertEqual(r_r, r)
    self.assertEqual(r_p, p)
    self.assertEqual(r_f, f)
    # A summary compared with itself must score a perfect 1.
    score_equal = rouge.rouge_l(summary, summary)
    self.assertEqual(score_equal, 1)
def test_one(self):
    """Adding a second relevant sentence must strictly increase the rouge-1 score
    on a real German judgment excerpt."""
    original = 'Für die Frage, ob alle in Art. 6 Abs. 1 der Richtlinie 2011/83/EU genannten Informationen objektiv in einem Werbemittel dargestellt werden können, ist erheblich, welchen Anteil diese Informationen am verfügbaren Raum des vom Unternehmer ausgewählten Werbeträgers einnehmen würden; die Werbebotschaft muss gegenüber den Verbraucherinformationen nicht zurücktreten.'
    sent_1 = '(1) Zwar ist für die nach der Vorabentscheidung des Gerichtshofs der Europäischen Union maßgebliche Frage, ob alle in Art. 6 Abs. 1 der Richtlinie 2011/83/EU genannten Informationen objektiv in einem Werbemittel dargestellt werden können, erheblich, welchen Anteil diese Informationen am verfügbaren Raum des vom Unternehmer ausgewählten Werbeträgers einnehmen würden.'
    sent_2 = 'Aus der Anforderung, die Informationen objektiv in der Werbebotschaft darstellen zu können, ist zu schließen, dass die Werbebotschaft gegenüber den Verbraucherinformationen nicht zurücktreten muss.'
    rouge_v1 = rouge.rouge_n(original, sent_1, 1, pp_options=[utils.pp_option_stopwords])
    rouge_v2 = rouge.rouge_n(original, sent_1 + ' ' + sent_2, 1, pp_options=[utils.pp_option_stopwords])
    self.assertGreater(rouge_v2, rouge_v1)
def test_one_match(self):
    """A single overlapping unigram must already yield a positive rouge-1 score."""
    reference = 'a b c d e.'
    self.assertGreater(rouge.rouge_n(reference, 'd.', n=1), 0)
def test_extension(self):
    """Combining a summary with an EXTENDED variant of itself must never lower
    the rouge-1 score (monotonicity under added correct content)."""
    original_short = 'a b c d e.'
    original_medi = 'a b c d e f g h i j k l m n o.'
    original_long = 'a b c d e f g h i j k l m n o p q r s t u v w x y z.'
    # each entry: [reference, shorter candidate, extended candidate, description]
    test_data = [[original_short, 'a b.', 'a b d.', 'small extension short sentence'],
                 [original_short, 'a.', 'a b c d.', 'large extension short sentence'],
                 [original_medi, 'a b c d e f g h i.', 'a b c d e f g h i j.', 'small extension medi sentence'],
                 [original_medi, 'a b c d e f g h i.', 'a b c d e f g h i m n o l.',
                  'large extension medi sentence'],
                 [original_long, 'a b c d e f g h i j k l m n o p q r s t u v.',
                  'a b c d e f g h i j k l m n o p q r s t u v w.', 'small extension long sentence'],
                 [original_long, 'a b c d e f g h i j k.',
                  'a b c d e f g h i j k l m n o p q r s t u v w.', 'large extension long sentence'],
                 ]
    print('Test extensions')
    run_tests_with_data(self, test_data)
def test_differing(self):
    """Checks ROUGE behaviour when the created summary differs from the reference,
    for short, medium and long reference sentences."""
    short_ref = 'a b c d e.'
    medi_ref = 'a b c d e f g h i j k l m n o.'
    long_ref = 'a b c d e f g h i j k l m n o p q r s t u v w x y z.'
    cases = [
        [short_ref, 'a b c.', 'a b d.', 'small difference short sentence'],
        [short_ref, 'a e.', 'a b c.', 'large difference short sentence'],
        [medi_ref, 'a b c d e f g h i.', 'a b c d e f g h j.', 'small difference medi sentence'],
        [medi_ref, 'a b c d e f g h i.', 'a b c d j k l m.',
         'large difference medi sentence'],
        [long_ref, 'a b c d e f g h i j k l m n o p q r s t u v.',
         'a b c d e f g h i j k l m n o p q r s t u w.', 'small difference long sentence'],
        [long_ref, 'a b c d e f g h i j k.',
         'a b l m n o p q r s t u v.', 'large difference long sentence'],
    ]
    print('Test differences')
    run_tests_with_data(self, cases)
def test_rougel_high_precision_or_recall(self):
    """
    Tests ROUGE-L precision, recall and F1 on edge cases where one of the two
    is extreme, then checks on two real judgment/extract pairs that all scores
    stay inside the valid range (<= 1).
    """
    # Created summary repeats the single reference word: precision 1/2, recall 1.
    gold = 'Boot.'
    created = 'Boot. Boot.'
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertEqual(r_p, 1/2)
    self.assertEqual(r_r, 1)
    self.assertEqual(r_f, 2/3)
    # Created summary matches 2 of 6 reference tokens: precision 1, recall 2/6.
    gold = 'Affe Boot. Boot Club.'
    created = 'Boot.'
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertEqual(r_p, 1)
    self.assertEqual(r_r, 2/6)
    self.assertEqual(r_f, 1/2)
    # Real-world pair: created summary is a near-verbatim extract of one gold
    # sentence; precision must not exceed 1.
    gold = 'Im Rahmen der bei Prüfung der Schutzschranke der Berichterstattung über Tagesereignisse gemäß § 50 ' \
           'UrhG vorzunehmenden Grundrechtsabwägung ist im Falle der Veröffentlichung eines bislang ' \
           'unveröffentlichten Werks auch das vom Urheberpersönlichkeitsrecht geschützte Interesse an einer ' \
           'Geheimhaltung des Werks zu berücksichtigen. Dieses schützt das urheberrechtsspezifische Interesse des ' \
           'Urhebers, darüber zu bestimmen, ob er mit der erstmaligen Veröffentlichung den Schritt von der ' \
           'Privatsphäre in die Öffentlichkeit tut und sich und sein Werk damit der öffentlichen Kenntnisnahme ' \
           'und Kritik aussetzt. Nicht zu berücksichtigen ist bei dieser Abwägung dagegen das Interesse an der ' \
           'Geheimhaltung von Umständen, deren Offenlegung Nachteile für die Interessen des Staates und seiner ' \
           'Einrichtungen haben könnten. Dieses Interesse ist nicht durch das Urheberpersönlichkeitsrecht, ' \
           'sondern durch andere Vorschriften - etwa das Sicherheitsüberprüfungsgesetz, § 3 Nr. 1 Buchst. b IFG ' \
           'und die strafrechtlichen Bestimmungen gegen Landesverrat und die Gefährdung der äußeren Sicherheit ' \
           'gemäß §§ 93 ff. StGB - geschützt. '
    created = 'Dieses Interesse ist vielmehr durch die allgemeinen Vorschriften - etwa das ' \
              'Sicherheitsüberprüfungsgesetz, § 3 Nr. 1 Buchst. b IFG und die strafrechtlichen Bestimmungen gegen ' \
              'Landesverrat und die Gefährdung der äußeren Sicherheit gemäß §§ 93 ff. '
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertLessEqual(r_p, 1)
    # Second real-world pair: long extract with heavy overlap; both precision
    # and recall must stay <= 1.
    gold = 'Der Eigentümer eines Grundstücks ist hinsichtlich der von einem darauf befindlichen Baum (hier: ' \
           'Birken) ausgehenden natürlichen Immissionen auf benachbarte Grundstücke Störer i.S.d. § 1004 Abs. 1 ' \
           'BGB, wenn er sein Grundstück nicht ordnungsgemäß bewirtschaftet. Hieran fehlt es in aller Regel, ' \
           'wenn die für die Anpflanzung bestehenden landesrechtlichen Abstandsregelungen eingehalten sind. 1b. ' \
           'Ein Anspruch auf Beseitigung des Baums lässt sich in diesem Fall regelmäßig auch nicht aus dem ' \
           'nachbarlichen Gemeinschaftsverhältnis herleiten. Hält der Grundstückseigentümer die für die ' \
           'Anpflanzung bestehenden landesrechtlichen Abstandsregelungen ein, hat der Eigentümer des ' \
           'Nachbargrundstücks wegen der Beeinträchtigungen durch die von den Anpflanzungen ausgehenden ' \
           'natürlichen Immissionen weder einen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 BGB in unmittelbarer ' \
           'Anwendung noch einen nachbarrechtlichen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 analog (' \
           'Abgrenzung zu Senat, Urteil vom 27. Oktober 2017 - V ZR 8/17, ZfIR 2018, 190). '
    created = "Für die Entscheidung des Meinungsstreits ist von dem oben dargelegten Grundsatz auszugehen, " \
              "dass der Eigentümer eines Grundstücks hinsichtlich der von einem darauf befindlichen Baum " \
              "ausgehenden natürlichen Immissionen auf benachbarte Grundstücke Störer i.S.d. § 1004 Abs. 1 BGB " \
              "ist, wenn er sein Grundstück nicht ordnungsgemäß bewirtschaftet. Hält der Grundstückseigentümer " \
              "die für die Anpflanzung bestehenden landes-rechtlichen Abstandsregelungen ein, hat der Eigentümer " \
              "des Nachbargrund-stücks wegen der Beeinträchtigungen durch die von den Anpflanzungen ausgehenden " \
              "natürlichen Immissionen weder einen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 BGB in " \
              "unmittelbarer Anwendung noch einen nachbarrechtlichen Ausgleichsanspruch gemäß § 906 Abs. 2 Satz 2 " \
              "analog. Sind die für die Anpflanzung bestehenden landesrechtlichen Abstandsregelungen eingehalten, " \
              "lässt sich ein Anspruch auf Beseitigung der Bäume in aller Regel - und so auch hier - nicht aus " \
              "dem nachbarlichen Gemeinschaftsverhältnis herleiten. Gemäß § 907 Abs. 2 BGB gehören aber Bäume und " \
              "Sträucher nicht zu den Anlagen i.S.d. § 907 Abs. 1 BGB. Ob den Grundstückseigentümer für " \
              "natürliche Immissionen eine „Sicherungspflicht“ trifft und er damit Störer i.S.d. § 1004 Abs. 1 " \
              "BGB ist, ist jeweils anhand der Umstände des Einzelfalls zu prüfen. Rechtsfehlerhaft ist jedoch " \
              "die Auffassung des Berufungsgerichts, der Beklagte sei als Störer i.S.d. § 1004 Abs. 1 BGB für die " \
              "von den Birken ausgehenden Immissionen auf das Grundstück des Klägers verantwortlich. In diesem " \
              "Fall ist er regelmäßig schon nicht Störer, so dass es bereits an einem Beseitigungsanspruch gemäß " \
              "§ 1004 Abs. 1 BGB fehlt und der von dem Berufungsgericht beschriebene Konflikt zwischen den Regeln " \
              "des Bürgerlichen Gesetzbuchs und den landesrechtlichen Vorschriften nicht besteht. Voraussetzung " \
              "hierfür ist jedoch, dass der in Anspruch genommene Grundstückseigentümer für die " \
              "Eigentumsbeeinträchtigung verantwortlich und damit Störer i.S.d. § 1004 Abs. 1 BGB ist. "
    r_p, r_r, r_f = rouge.rouge_l(created_summary=created, reference_summary=gold, extended_results=True,
                                  pp_options=[utils.pp_option_stopwords, utils.pp_option_lemmatize])
    self.assertLessEqual(r_p, 1)
    self.assertLessEqual(r_r, 1)
def test_specific(self):
    """Regression test: an unrelated second sentence must score lower on its own,
    while concatenating both created sentences must still raise the F-score."""
    gold = 'Diese Voraussetzungen hat der XII. Zivilsenat für den vorliegenden Fall bejaht.'
    created = 'Dies ist insbesondere der Fall, wenn die Sanktion außer Verhältnis zum Gewicht des Vertragsverstoßes und den Folgen für den Schuldner der Vertragsstrafe steht.'
    created_2 = 'Deren Untergrenze ist mit 30 € angegeben.'
    shared_options = [utils.pp_option_stopwords, utils.pp_option_lemmatize]
    _, _, f_first = rouge.rouge_l(created_summary=created, reference_summary=gold,
                                  extended_results=True, pp_options=shared_options)
    _, _, f_second = rouge.rouge_l(created_summary=created_2, reference_summary=gold,
                                   extended_results=True, pp_options=shared_options)
    _, _, f_combined = rouge.rouge_l(created_summary=created + ' ' + created_2, reference_summary=gold,
                                     extended_results=True, pp_options=shared_options)
    self.assertGreater(f_first, f_second)
    self.assertGreater(f_combined, f_first)

475
utils.py

@ -0,0 +1,475 @@
import json
import multiprocessing
import os
import re
import shutil
import pandas as pd
import settings
# Multiprocessing pool configuration used by parallel_imap.
pool_processes = 8
pool_maxtask = 10
pool_chunksize = 30
# Section / column names of the court-decision dataframes.
leitsatz_str = 'leitsatz'
tenor_str = 'tenor'
tatbestand_str = 'tatbestand'
entscheidungsgruende_str = 'entscheidungsgruende'
aktenzeichen_str = 'aktenzeichen'
# Columns that contain free running text of a decision.
rii_text_columns = [leitsatz_str, tenor_str, tatbestand_str, entscheidungsgruende_str]
# Punctuation treated as sentence / clause marks throughout this module.
sentence_marks = ['.', ',', ';', '!', '?']
# Option flags accepted by preprocess_text; the string values double as descriptions.
pp_option_lemmatize = 'preprocessing: lemmatize the text'
pp_option_stopwords = 'preprocessing: remove stopwords'
pp_option_case_normalize = 'preprocessing: normalize cases / put to lower'
pp_option_remove_qout_marks_sing = 'preprocessing: remove qoutation marks around single words'
# Stopwords that must never be removed because they negate meaning.
no_stopword_list = ['nicht', 'kein']
# Sentences that typically open part II ("Entscheidungsgründe") of a judgment.
entsch_gr_start_sentences = ['II.', 'B.', 'B']
def server_path(current_path, path):
    """
    Method to add path in case it is run on server.

    :param current_path: Path to add when run on server
    :param path: Path for local
    :return: Final path for local or server
    """
    if not settings.server:
        return path
    return current_path + '/' + path
def open_file(current_path, path, modes, encoding=None, newline=None):
    """
    Wraps the builtin open function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :param modes: Modes to apply
    :param encoding: encoding option of the original method, if None nothing will be passed
    :param newline: newline option of the original method, if None nothing will be passed
    :return: the opened file
    """
    # Build the keyword arguments dynamically so that None values are simply not
    # passed on to open(). This fixes the original version, in which the
    # "encoding and newline both given" branch was unreachable (earlier single-
    # option branches returned first), silently dropping the newline setting.
    kwargs = {}
    if encoding is not None:
        kwargs['encoding'] = encoding
    if newline is not None:
        kwargs['newline'] = newline
    return open(server_path(current_path=current_path, path=path), modes, **kwargs)
def file_exists(current_path, path):
    """
    Wraps the builtin exists function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: True if the file exists
    """
    full_path = server_path(current_path=current_path, path=path)
    return os.path.exists(full_path)
def list_dir_files(current_path, path):
    """
    Wraps the builtin os.listdir function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The filenames of the directory
    """
    full_path = server_path(current_path=current_path, path=path)
    return os.listdir(full_path)
def df_from_pickle(current_path, path):
    """
    Wraps the pd.read_pickle function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The loaded dataframe
    """
    full_path = server_path(current_path=current_path, path=path)
    return pd.read_pickle(full_path)
def df_to_json(current_path, path, dataframe):
    """
    Wraps the df.to_json function to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :param dataframe: The dataframe to save
    """
    target = server_path(current_path=current_path, path=path)
    dataframe.to_json(target)
def df_from_json(current_path, path):
    """
    Wraps the json.load function in combination with a dataframe creation to adjust to server settings.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The loaded dataframe
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original passed the open file straight into json.load and leaked it).
    with open_file(current_path=current_path, path=path, modes="r") as file:
        return pd.DataFrame(json.load(file))
def time_convert(sec):
    """
    Pretty-prints a duration. Format: Time Lapsed = hh:mm:ss

    :param sec: duration in seconds to display
    """
    hours, remainder = divmod(sec, 3600)
    mins, secs = divmod(remainder, 60)
    print("Time Lapsed = {0}:{1}:{2}".format(int(hours), int(mins), secs))
def parallel_imap(function, packaged_args):
    """
    Executes the given function in a parallel way. For list data.

    :param function: Function to do in parallel.
    :param packaged_args: Iterable of argumentpairs for each run to be done.
    :return: Result of the parallel work (a lazy imap iterator)
    """
    if settings.server:
        # On the server: recycle worker processes after pool_maxtask tasks and
        # use a larger chunksize to cut inter-process communication overhead.
        pool_obj = multiprocessing.Pool(maxtasksperchild=pool_maxtask)
        result = pool_obj.imap(function, packaged_args, chunksize=pool_chunksize)
    else:
        # Locally: fixed number of worker processes, default chunksize.
        pool_obj = multiprocessing.Pool(processes=pool_processes)
        result = pool_obj.imap(function, packaged_args)
    # NOTE(review): imap returns a lazy iterator, yet the pool is closed and
    # joined before the caller consumes it — this works because join waits for
    # all submitted tasks to finish, but confirm the iterator is fully consumed
    # by callers before relying on this pattern.
    pool_obj.close()
    pool_obj.join()
    return result
def get_step_subset_raw(steps, path_to_dest_dataframe, source_data, dest_data, call_path):
    """
    Method for stepwise work on datasets. Reads in the already present data and starts
    where last time ended. Used for raw pickle-files in destination.

    :param steps: How many rows should be selected now
    :param path_to_dest_dataframe: Path on where to load the destination data
    :param source_data: Source dataframe (or plain sequence) to select the rows from
    :param dest_data: empty dataframe to load the data into
    :param call_path: path from which the method was called, for server path
    :return: the subset of the source data and the loaded destination data (source, dest)
    """
    if steps > 0:
        try:
            try:
                # Destination results may be stored as pickle ...
                var = df_from_pickle(current_path=call_path, path=path_to_dest_dataframe)
            except Exception:
                # ... or as json; fall back to that format.
                var = df_from_json(current_path=call_path, path=path_to_dest_dataframe)
            dest_data = pd.concat([dest_data, var], ignore_index=True)
            # Resume directly after the rows that were already processed.
            start = dest_data.shape[0]
        except OSError as _:
            # No previous results on disk: start from the beginning.
            start = 0
        finally:
            # NOTE(review): returning from inside `finally` swallows any exception
            # raised above, and `start` would be unbound if a non-OSError escaped
            # both load attempts — confirm this is the intended best-effort style.
            end = start + steps
            try:  # case source is a dataframe
                if end >= source_data.shape[0]:
                    return source_data.iloc[start:], dest_data  # subset
                else:
                    return source_data.iloc[start:end], dest_data  # subset
            except Exception:
                # Source has no .shape / .iloc: treat it as a plain sequence.
                if end >= len(source_data):
                    return source_data[start:], dest_data  # subset
                else:
                    return source_data[start:end], dest_data  # subset
    # NOTE(review): implicitly returns None when steps <= 0 — verify callers expect this.
def remove_spaces_before_sentence_marks(text):
    """
    Removes unnecessary spaces before '.' and the other sentence marks.

    :param text: Text to replace in
    :return: The cleaned text
    """
    for mark in sentence_marks:
        padded = ' ' + mark
        # Repeat until no padded occurrence is left (handles runs of spaces).
        while padded in text:
            text = text.replace(padded, mark)
    return text
def remove_brackets(text):
    """
    Removes all matching round bracket pairs () with their content. Always takes the
    first brackets that appear in the text, so could also be an enumeration like a).

    Fix over the original: an unmatched '(' that is not at position 0 no longer
    passes the closing-bracket check (the old code added find()'s -1 to the start
    index, which stayed > -1 and silently dropped the space before the bracket).

    :param text: Text to remove the brackets from.
    :return: Resulting text
    """
    res = ''
    startindex = text.find('(')
    while startindex > -1:
        # Search the closing bracket only after the opening one.
        endindex = text.find(')', startindex)
        if endindex == -1:
            # Unmatched '(' - keep the remainder untouched.
            break
        # In case there is a ' ' in front of the brackets, remove one space
        # so the surrounding words do not end up with a double space.
        if startindex > 0 and text[startindex - 1] == ' ':
            startindex -= 1
        res += text[:startindex]
        text = text[endindex + 1:]
        startindex = text.find('(')
    return res + text
def remove_leading_keywords_and_listing_sentences(sentences):
    """
    Method intended for Leitsätze. Some of them start with a single keyword in the first line.
    This is removed. Additionally, sentences which are only a listing ('1.') will also be removed.

    :param sentences: List of sentences in the original order to remove these things from
    :return: the list of sentences after removing
    """
    # remove leading keywords and sentences which are only enumerations
    sentences_var = list()
    # Buffer accumulating fragments until a real sentence end is reached.
    sentence_var = ''
    for i in range(len(sentences)):
        sentence = sentences[i].strip()
        if len(sentence) > 1 and sentence[-1] == '.' and ' ' not in sentence:  # at least two chars
            if any(char.isdigit() for char in sentence) and sentence[0].isdigit():  # most likely enumeration like '1.'
                continue
        # A first-position fragment of <= 20 chars is treated as a leading
        # keyword and silently dropped (the branch below is skipped for it).
        if i > 0 or (i == 0 and len(sentence) > 20):
            # most likely not a short keyword at the beginning
            # NOTE(review): an empty fragment here would raise IndexError on
            # sentence[-1] — confirm inputs never contain empty strings.
            if sentence[-1] == '.' or sentence[-1] == ',' or sentence[-1] == ':' or \
                    sentence[-1] == ';' or sentence[-1] == '!' or sentence[-1] == '?':
                # sentence end: flush the buffer as one cleaned sentence
                sentence_var += sentence
                sentences_var.append(remove_spaces_before_sentence_marks(sentence_var))
                sentence_var = ''
            else:
                # continuing sentence
                sentence_var += sentence + ' '
    return sentences_var
def prepare_leitsatz(l_satz):
    """
    Does the preparation for Leitsätze: splits into sentences, removes leading
    keywords and single listing sentences, then strips leading listings from
    each remaining sentence.

    :param l_satz: Original Leitsatz as one string
    :return: prepared Leitsatz as a list of String
    """
    prepared = split_into_sentences(l_satz)
    prepared = remove_leading_keywords_and_listing_sentences(prepared)
    return [remove_leading_listing(item) for item in prepared]
def select_list_subset(list_of_string, start_strings, end_string=None):
    """
    Selects a contiguous subset of a list of strings (case-sensitive).

    Copying starts at the first element contained in start_strings and stops at
    the first element equal to end_string (which is itself excluded); without an
    end_string, copying runs to the end. If no start string is found — or the
    selection covers less than 20% of the input — the whole original list is
    returned instead.

    :param list_of_string: List to get subset from
    :param start_strings: List of Strings to start to copy
    :param end_string: First string where one shouldn't copy anymore, if none is given, then till the end
    :return: Selected subset
    """
    selected = []
    copying = False
    for item in list_of_string:
        if item in start_strings:
            copying = True
        if end_string is not None and item == end_string:
            copying = False
        if copying:
            selected.append(item)
    # Fall back to the full list when nothing (or almost nothing) was selected.
    if not selected or len(selected) / len(list_of_string) < 0.2:
        return list_of_string
    return selected
def abbreviation_ending(text):
    """
    Checks for an input text whether it ends with a known legal abbreviation.
    Known issues: numbers and roman numbering with following dots arent matched.

    Fix over the original: a missing comma between ' zw.' and 'z.Z.' made Python
    concatenate the two literals into the bogus entry ' zw.z.Z.', so neither
    abbreviation was ever recognised.

    :param text: Input Text
    :return: True, if it does end with such an abbreviation, False otherwise
    """
    # Entries with a leading space only match as separate words; entries without
    # one also match as a word suffix.
    abbrev_list = ['A.', ' a.', 'a.A.', 'a.a.O.', 'ABl.', ' abl.', 'Abs.', ' abs.', 'Abschn.', 'Abse.',
                   ' abzgl.', 'a.D.', 'a.E.', ' a.F.', ' ähnl.', 'a.l.i.c.', ' allg.', ' allgem.',
                   'Alt.', 'AmtsBl.', ' and.', ' angef.', 'Anh.', 'Anl.', 'Anm.', ' Art.', '(Art.', ' aufgeh.',
                   'Aufl.', ' ausf.', 'Ausn.', 'BAnz.', 'BArbBl.', 'BayJMBl.', 'Bd.', 'Bde.', 'Bdg.',
                   'Bearb.', ' begr.', 'Beil.', 'Bek.', ' ber.', ' bes.', 'Beschl.', ' best.', ' bestr.',
                   'Betr.', ' betr.', 'Bf.', 'BGBl.', ' bish.', ' Bl.', 'BPräs.', 'BReg.', 'Bsp.', 'Bst.',
                   'BStBl.', 'BT-Drucks.', 'Buchst.', 'bzgl.', 'bzw.', 'c.i.c.', 'Co.', 'c.p.c.',
                   'c.s.q.n.', 'Ct.', ' dar.', 'Darst.', ' ders.', 'd.h.', 'Diss.', ' div.', 'Dr.',
                   'Drucks.', ' dto.', 'DVBl.', ' ebd.', ' Ed.', 'E.G.', ' eingef.', 'Einf.', 'Einl.',
                   ' einschl.', 'Erg.', ' erk.Sen.', ' erk.', ' Erl.', 'etc.', 'E.U.', ' e.V.',
                   'EVertr.', ' evtl.', 'E.W.G.', ' F.', ' f.', ' Fa.', ' Festschr.', ' ff.', ' Fn.',
                   ' form.', ' fr.', ' fr.Rspr.', ' Fz.', 'GBl.', ' geänd.', 'Gedschr.', ' geg.',
                   ' gem.', 'Ges.', ' gg.', ' ggf.', ' ggü.', ' ggüb.', ' Gl.', ' GMBl.', 'G.o.A.',
                   'Grds.', ' grdsl.', 'Großkomm.', 'Großkomm.z.', 'GVBl.', 'GVOBl.', ' h.A.', 'Halbs.',
                   ' h.c.', 'Hdlg.', 'Hess.', ' heut.', ' heut.Rspr.', ' hins.', ' h.L.', ' h.Lit.',
                   ' h.M.', 'Hrsg.', ' h.Rspr.', 'HS.', 'Hs.', ' i.A.', ' ib.', ' ibd.', ' ibid.',
                   'i.d.', 'i.d.F.', 'i.d.R.', 'i.d.S.', 'i.E.', 'i.e.', 'i.e.S.', 'i.H.d.', 'i.H.v.',
                   'i.K.', ' incl.', ' inkl.', 'inkl.MwSt.', ' insb.', ' insbes.', 'Int.', ' i.O.',
                   ' i.R.', ' i.R.d.', 'i.S.', 'i.S.d.', 'i.S.e.', 'i.S.v.', 'i.ü.', ' iur.', 'i.V.',
                   'i.V.m.', 'i.W.', 'i.Wes.', 'i.w.S.', 'i.Zw.', 'Jahrb.', ' jew.', ' Jh.', 'JMBl.',
                   ' jur.', ' Kap.', ' Ko.', ' krit.', ' kzfr.', 'Lb.', 'Lfg.', 'lgfr.', ' Lief.',
                   'Lit.', ' lit.', ' lt.', 'Ltd.', 'M.A.', 'm.Änd.', 'MABl.', 'mat.', 'm.a.W.', 'm.E.',
                   ' med.', 'mgl.', 'Mglkt.', 'MinBl.', 'Mio.', ' Mot.', 'M.M.', 'm.N.', 'Mod.',
                   ' mögl.', 'Mot.', 'MünchKomm.', 'm.w.', 'm.w.N.', 'MwSt.', 'Mwst.', 'm.W.v.',
                   'm.zust.Anm.', 'Nachw.', 'Nachw.b.', ' nat.', 'Nds.', 'Neubearb.', 'Neuf.',
                   ' neugef.', 'n.F.', 'Nr.', 'Nrn.', ' o.', 'o.Ä.', ' od.', ' oec.', ' öff.',
                   ' o.g.', ' österr.', 'p.F.V.', ' pharm.', ' phil.', ' pol.', 'Postf.', ' pp.',
                   ' ppA.', ' ppa.', 'Prof.', 'Prot.', ' publ.', ' p.V.', 'p.V.V.', 'q.e.d.',
                   'RdErl.', 'Rdn.', 'Rdnr.', 'RdSchr.', ' rel.', ' rer.', 'RGBl.', 'Rn.', 'Rspr.',
                   'Rz.', 'S.', ' s.', 's.a.', 'Schr.', ' scil.', 'Sen.', ' sinngem.', 'SiZess.',
                   'Slg.', 's.o.', ' sog.', 'Sonderbeil.', 'Stpfl.', ' str.', ' st.', 'st.Rspr.',
                   ' st. Rspr.', 'stud.iur.', 's.u.', ' teilw.', ' theol.', 'Thür.', ' TO.', ' tw.',
                   'Tz.', ' u.', 'u.a.', 'UAbs.', 'u.a.m.', ' umstr.', ' unmgl.', 'Unmglkt.', ' unmögl.',
                   'Urt.', ' usw.', ' u.U.', ' V.', ' v.', 'Var.', 'Ver.', ' vgl.', 'V.m.', 'VOBl.',
                   'Vor.', 'Vorbem.', 'Warn.', ' weg.', ' wg.', 'W.G.G.', 'w.z.b.w.', 'z.B.', 'z.Hd.',
                   'Ziff.', 'z.T.', ' zust.', 'zust.Anm.', ' zw.', 'z.Z.', ' zzgl.', ';', 'II.1.a.', '(s.',
                   ]
    for abbrev in abbrev_list:
        if text.endswith(abbrev):
            return True
    # Fallback: a single-character word followed by a dot (e.g. initials ' M.').
    if len(text) >= 3 and re.search(" .\\.", text[-3:]):
        return True
    return False
def remove_leading_listing(sentence):
    """
    Removes a leading listing / enumeration marker like '1.' or 'a)'.

    :param sentence: Sentence to remove from
    :return: Processed sentence
    """
    _, remainder = split_leading_listing(sentence)
    return remainder
def split_leading_listing(sentence):
    """
    Splits the sentence from a possible listing (1. or a) ) at the start.

    :param sentence: Sentence to split
    :return: (start, rest) with start being the listing or None, if there is no listing and
            rest being the rest of the sentence or the original sentence if there was no listing
    """
    tokens = sentence.split()
    head = tokens[0] if tokens else ''
    remainder = sentence[len(head) + 1:]
    # 'Art.' and single trailing words are excluded: could be a name like M. Leicht
    looks_like_enum = (head.endswith('.') or head.endswith(')')) \
        and len(remainder.split()) > 1 and head != 'Art.'
    if looks_like_enum:
        return head, remainder
    return None, sentence
def split_into_sentences(input_text):
    """
    Splits text into sentences. Uses spacy sentences but fixes broken sentences on \n or Abbreviations.

    :param input_text: Text to split into sentences
    :return: A list of sentences which where split
    """
    paragraphs = input_text.split('\n')
    sentences = list()
    # Buffer that accumulates spacy fragments until a real sentence end is found.
    sentence_var = ''
    # roughly split original leitsatz into sentences
    for paragraph in paragraphs:
        nlp_paragraph = settings.nlp(paragraph)
        sentences_paragraph = []
        for sent in nlp_paragraph.sents:
            sent = sent.text.strip()
            # some leading listings aren't detected by spacy, split them off here
            a, b = split_leading_listing(sent)
            if a is not None:
                sentences_paragraph.append(a)
            sentences_paragraph.append(b)
        for i in range(0, len(sentences_paragraph)):
            # add a space before next token if it isn't a sentence mark
            if not (sentences_paragraph[i].startswith('.') or sentences_paragraph[i].startswith(':')
                    or sentences_paragraph[i].startswith('?') or sentences_paragraph[i].startswith('!')):
                sentence_var += ' '
            sentence_var += sentences_paragraph[i]
            # if not sentence_var.count('(') > sentence_var.count(
            #         ')') and not sentence_var.strip() == '':  # no unclosed brackets
            if (sentences_paragraph[i].endswith('.') or sentences_paragraph[i].endswith(':')
                or sentences_paragraph[i].endswith('?') or sentences_paragraph[i].endswith('!')) \
                    and not abbreviation_ending(sentence_var) \
                    and not sentence_var.strip() == '':
                # the sentence is most likely really finished here
                sentences.append(sentence_var.strip())
                sentence_var = ''
        if not sentence_var.strip() == '':
            # if not sentence_var.count('(') > sentence_var.count(
            #         ')') and not sentence_var.strip() == '':  # no unclosed brackets
            sentences.append(sentence_var.strip())  # flush at the end of the paragraph as well
            sentence_var = ''
    # end of whole text
    if sentence_var.strip() != '':
        sentences.append(sentence_var.strip())
    return sentences
def preprocess_text(text, options):
    """
    Allows simple preprocessing like lemmatization on strings.

    :param text: Text to preprocess
    :param options: Options specifying on what preprocessing is to be done, if None, text will be returned
    :return: the preprocessed text, if text is None, the result will also be ''
    """
    if text is None:
        return ''
    if options is None:
        return text
    pieces = []
    for token in settings.nlp(text):
        # keep a token unless it is a stopword, stopword removal is requested,
        # and it is not a protected negation word
        keep = (not token.is_stop
                or pp_option_stopwords not in options
                or token.lemma_ in no_stopword_list)
        if not keep:
            continue
        # lemmatization if wanted (sentence marks are kept verbatim)
        if pp_option_lemmatize in options and token.text not in sentence_marks:
            word = token.lemma_
        else:
            word = token.text
        # strip quotation marks wrapped around a single word if requested
        if pp_option_remove_qout_marks_sing in options and word[0] == '"' and word[-1] == '"':
            word = word.replace('"', '')
        pieces.append(word)
    result_text = ' '.join(pieces)
    # case-normalization, all to lower
    if pp_option_case_normalize in options:
        return result_text.lower()
    return result_text
def create_dir(current_path, directory_name, delete=True):
    """
    Creates a directory if it doesn't exist.

    :param current_path: path of the calling file
    :param directory_name: name / path to create
    :param delete: if True, an existing directory with the same name is deleted first
    """
    full_path = server_path(current_path=current_path, path=directory_name)
    if delete and file_exists(current_path=current_path, path=directory_name):
        shutil.rmtree(full_path)
    if not file_exists(current_path=current_path, path=directory_name):
        os.makedirs(full_path)
Loading…
Cancel
Save