You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

475 lines
20 KiB

  1. import json
  2. import multiprocessing
  3. import os
  4. import re
  5. import shutil
  6. import pandas as pd
  7. import settings
  8. pool_processes = 8
  9. pool_maxtask = 10
  10. pool_chunksize = 30
  11. leitsatz_str = 'leitsatz'
  12. tenor_str = 'tenor'
  13. tatbestand_str = 'tatbestand'
  14. entscheidungsgruende_str = 'entscheidungsgruende'
  15. aktenzeichen_str = 'aktenzeichen'
  16. rii_text_columns = [leitsatz_str, tenor_str, tatbestand_str, entscheidungsgruende_str]
  17. sentence_marks = ['.', ',', ';', '!', '?']
  18. pp_option_lemmatize = 'preprocessing: lemmatize the text'
  19. pp_option_stopwords = 'preprocessing: remove stopwords'
  20. pp_option_case_normalize = 'preprocessing: normalize cases / put to lower'
  21. pp_option_remove_qout_marks_sing = 'preprocessing: remove qoutation marks around single words'
  22. no_stopword_list = ['nicht', 'kein']
  23. entsch_gr_start_sentences = ['II.', 'B.', 'B']
  24. def server_path(current_path, path):
  25. """
  26. Method to add path in case it is run on server.
  27. :param current_path: Path to add when run on server
  28. :param path: Path for local
  29. :return: Final path for local or server
  30. """
  31. if settings.server:
  32. path = current_path + '/' + path
  33. return path
  34. def open_file(current_path, path, modes, encoding=None, newline=None):
  35. """
  36. Wraps the builtin open function to adjust to server settings
  37. :param current_path: path of the calling file to adjust for server (without /)
  38. :param path: Path for file loading relative to calling file
  39. :param modes: Modes to apply
  40. :param newline: newline option of the original method, if None nothing will be passed
  41. :param encoding: encoding option of the original method, if None nothing will be passed
  42. :return: the opened file
  43. """
  44. if encoding is not None:
  45. return open(server_path(current_path=current_path, path=path), modes, encoding=encoding)
  46. if newline is not None:
  47. return open(server_path(current_path=current_path, path=path), modes, newline=newline)
  48. if newline is not None and encoding is not None:
  49. return open(server_path(current_path=current_path, path=path), modes, encoding=encoding, newline=newline)
  50. return open(server_path(current_path=current_path, path=path), modes)
  51. def file_exists(current_path, path):
  52. """
  53. Wraps the builtin exists function to adjust to server settings
  54. :param current_path: path of the calling file to adjust for server (without /)
  55. :param path: Path for file loading relative to calling file
  56. :return: True if the file exists
  57. """
  58. return os.path.exists(server_path(current_path=current_path, path=path))
  59. def list_dir_files(current_path, path):
  60. """
  61. Wraps the builtin os.listdir function to adjust to server settings
  62. :param current_path: path of the calling file to adjust for server (without /)
  63. :param path: Path for file loading relative to calling file
  64. :return: The filenames of the directory
  65. """
  66. return os.listdir(server_path(current_path=current_path, path=path))
  67. def df_from_pickle(current_path, path):
  68. """
  69. Wraps the pd.read_pickle function to adjust to server settings
  70. :param current_path: path of the calling file to adjust for server (without /)
  71. :param path: Path for file loading relative to calling file
  72. :return: The loaded dataframe
  73. """
  74. return pd.read_pickle(server_path(current_path=current_path, path=path))
  75. def df_to_json(current_path, path, dataframe):
  76. """
  77. Wraps the df.to_json function to adjust to server settings
  78. :param current_path: path of the calling file to adjust for server (without /)
  79. :param path: Path for file loading relative to calling file
  80. :param dataframe: The dataframe to save
  81. """
  82. dataframe.to_json(server_path(current_path=current_path, path=path))
  83. def df_from_json(current_path, path):
  84. """
  85. Wraps the json.load function in combination with a dataframe creation to adjust to server settings
  86. :param current_path: path of the calling file to adjust for server (without /)
  87. :param path: Path for file loading relative to calling file
  88. :return: The loaded dataframe
  89. """
  90. return pd.DataFrame(json.load(open_file(current_path=current_path, path=path, modes="r")))
  91. def time_convert(sec):
  92. """
  93. Gibt eine Zeitangabe hübsch aus. Format : Time Lapsed = hh:mm:ss
  94. :param sec: Zeit zu zeigen
  95. """
  96. mins = sec // 60
  97. sec = sec % 60
  98. hours = mins // 60
  99. mins = mins % 60
  100. print("Time Lapsed = {0}:{1}:{2}".format(int(hours), int(mins), sec))
  101. def parallel_imap(function, packaged_args):
  102. """
  103. Executes the given function in a parallel way. For list data.
  104. :param function: Function to do in parallel.
  105. :param packaged_args: Iterable of argumentpairs for each run to be done.
  106. :return: Result of the parallel work
  107. """
  108. if settings.server:
  109. pool_obj = multiprocessing.Pool(maxtasksperchild=pool_maxtask)
  110. result = pool_obj.imap(function, packaged_args, chunksize=pool_chunksize)
  111. else:
  112. pool_obj = multiprocessing.Pool(processes=pool_processes)
  113. result = pool_obj.imap(function, packaged_args)
  114. pool_obj.close()
  115. pool_obj.join()
  116. return result
  117. def get_step_subset_raw(steps, path_to_dest_dataframe, source_data, dest_data, call_path):
  118. """
  119. Method for stepwise work on datasets. Reads in the already present data and starts
  120. where last time ended. Used for raw pickle-files in destination
  121. :param steps: How many rows should be selcted now
  122. :param path_to_dest_dataframe: Path on where to load the destination data
  123. :param source_data: Source dataframe to select the rows
  124. :param dest_data: empty dataframe to load the data into
  125. :param call_path: path from which the method was called, for server path
  126. :return: the subset of the source data an the loaded destintion data (source, dest)
  127. """
  128. if steps > 0:
  129. try:
  130. try:
  131. var = df_from_pickle(current_path=call_path, path=path_to_dest_dataframe)
  132. except Exception:
  133. var = df_from_json(current_path=call_path, path=path_to_dest_dataframe)
  134. dest_data = pd.concat([dest_data, var], ignore_index=True)
  135. start = dest_data.shape[0]
  136. except OSError as _:
  137. start = 0
  138. finally:
  139. end = start + steps
  140. try: # case source is a dataframe
  141. if end >= source_data.shape[0]:
  142. return source_data.iloc[start:], dest_data # subset
  143. else:
  144. return source_data.iloc[start:end], dest_data # subset
  145. except Exception:
  146. if end >= len(source_data):
  147. return source_data[start:], dest_data # subset
  148. else:
  149. return source_data[start:end], dest_data # subset
  150. def remove_spaces_before_sentence_marks(text):
  151. """
  152. Removes unneccessary spaces before '.' etc.
  153. :param text: Text to replace in
  154. :return: The cleaned text
  155. """
  156. for sentence_mark in sentence_marks:
  157. while ' ' + sentence_mark in text:
  158. text = text.replace(' ' + sentence_mark, sentence_mark)
  159. return text
  160. def remove_brackets(text):
  161. """
  162. Removes all matching round bracktet pairs () with their content. Always takes the first brackets that
  163. appear in the text, so could also be an enumeration like a)
  164. :param text: Text to remove the brackets from.
  165. :return: Resulting text
  166. """
  167. startindex = text.find('(')
  168. res = ''
  169. while startindex > -1:
  170. endindex = startindex + text[startindex:].find(')')
  171. if endindex > -1:
  172. # in case there is a ' ' in front or after the brackets, remove one space
  173. if startindex > 0 and text[startindex - 1] == ' ':
  174. startindex -= 1
  175. # if endindex < len(text) - 1 and text[endindex + 1] == ' ':
  176. # endindex += 1
  177. res += text[:startindex]
  178. text = text[endindex + 1:]
  179. else:
  180. break
  181. startindex = text.find('(')
  182. res += text
  183. return res
  184. def remove_leading_keywords_and_listing_sentences(sentences):
  185. """
  186. Method intended for Leitsätze. Some of them start with a single keyword in the first line.
  187. This is removed. Additionally, Sentences which are only a listin ('1.') will also be removed.
  188. :param sentences: List of sentences in the original order to remove these things from
  189. :return: the list of sentences after removing
  190. """
  191. # remove leading keywords and sentences which are only enumerations
  192. sentences_var = list()
  193. sentence_var = ''
  194. for i in range(len(sentences)):
  195. sentence = sentences[i].strip()
  196. if len(sentence) > 1 and sentence[-1] == '.' and ' ' not in sentence: # at least two chars
  197. if any(char.isdigit() for char in sentence) and sentence[0].isdigit(): # most likely enumeration like '1.'
  198. continue
  199. if i > 0 or (i == 0 and len(sentence) > 20):
  200. # most likely not a short keyword at the beginning
  201. if sentence[-1] == '.' or sentence[-1] == ',' or sentence[-1] == ':' or \
  202. sentence[-1] == ';' or sentence[-1] == '!' or sentence[-1] == '?':
  203. # sentence end
  204. sentence_var += sentence
  205. sentences_var.append(remove_spaces_before_sentence_marks(sentence_var))
  206. sentence_var = ''
  207. else:
  208. # continuing sentence
  209. sentence_var += sentence + ' '
  210. return sentences_var
  211. def prepare_leitsatz(l_satz):
  212. """
  213. Does the preparation for Leitsätze: First splits into sentences, removes leading keywords and
  214. single listing sentences and leading listings of sentences
  215. :param l_satz: Original Leitsatz as one string
  216. :return: prepared Leitsatz as a list of String
  217. """
  218. sentences = split_into_sentences(l_satz)
  219. sentences = remove_leading_keywords_and_listing_sentences(sentences)
  220. sentences = [remove_leading_listing(sentence) for sentence in sentences]
  221. return sentences
  222. def select_list_subset(list_of_string, start_strings, end_string=None):
  223. """
  224. Selects a subset of a list of strings. If the start_string is not in the list,
  225. the whole original list is returned. (case-sensitive)
  226. If more start strings are given, then it will be copied from the first occuring start string.
  227. sometimes entscheidungsgruende II. is started not with II. but B. Use start_String_2 here
  228. :param list_of_string: List to get subset from
  229. :param start_strings: List of Strings to start to copy
  230. :param end_string: First string where one shouldn't copy anymore, if none is given, then till the end
  231. :return: Selected subset
  232. """
  233. result_list = []
  234. copy = False
  235. for i in range(len(list_of_string)):
  236. string = list_of_string[i]
  237. if string in start_strings:
  238. copy = True
  239. if end_string is not None and string == end_string:
  240. copy = False
  241. if copy:
  242. result_list.append(string)
  243. # if nothing was found or very little was found
  244. if len(result_list) == 0 or len(result_list) / len(list_of_string) < 0.2:
  245. return list_of_string
  246. return result_list
  247. def abbreviation_ending(text):
  248. """
  249. Checks for an input text whether it ends with a known legal abbreviation.
  250. Known issues: numbers and roman numbering with following dots arent matched
  251. :param text: Input Text
  252. :return: True, if it does and with such an abbreviation, False otherwise
  253. """
  254. abbrev_list = ['A.', ' a.', 'a.A.', 'a.a.O.', 'ABl.', ' abl.', 'Abs.', ' abs.', 'Abschn.', 'Abse.',
  255. ' abzgl.', 'a.D.', 'a.E.', ' a.F.', ' ähnl.', 'a.l.i.c.', ' allg.', ' allgem.',
  256. 'Alt.', 'AmtsBl.', ' and.', ' angef.', 'Anh.', 'Anl.', 'Anm.', ' Art.', '(Art.', ' aufgeh.',
  257. 'Aufl.', ' ausf.', 'Ausn.', 'BAnz.', 'BArbBl.', 'BayJMBl.', 'Bd.', 'Bde.', 'Bdg.',
  258. 'Bearb.', ' begr.', 'Beil.', 'Bek.', ' ber.', ' bes.', 'Beschl.', ' best.', ' bestr.',
  259. 'Betr.', ' betr.', 'Bf.', 'BGBl.', ' bish.', ' Bl.', 'BPräs.', 'BReg.', 'Bsp.', 'Bst.',
  260. 'BStBl.', 'BT-Drucks.', 'Buchst.', 'bzgl.', 'bzw.', 'c.i.c.', 'Co.', 'c.p.c.',
  261. 'c.s.q.n.', 'Ct.', ' dar.', 'Darst.', ' ders.', 'd.h.', 'Diss.', ' div.', 'Dr.',
  262. 'Drucks.', ' dto.', 'DVBl.', ' ebd.', ' Ed.', 'E.G.', ' eingef.', 'Einf.', 'Einl.',
  263. ' einschl.', 'Erg.', ' erk.Sen.', ' erk.', ' Erl.', 'etc.', 'E.U.', ' e.V.',
  264. 'EVertr.', ' evtl.', 'E.W.G.', ' F.', ' f.', ' Fa.', ' Festschr.', ' ff.', ' Fn.',
  265. ' form.', ' fr.', ' fr.Rspr.', ' Fz.', 'GBl.', ' geänd.', 'Gedschr.', ' geg.',
  266. ' gem.', 'Ges.', ' gg.', ' ggf.', ' ggü.', ' ggüb.', ' Gl.', ' GMBl.', 'G.o.A.',
  267. 'Grds.', ' grdsl.', 'Großkomm.', 'Großkomm.z.', 'GVBl.', 'GVOBl.', ' h.A.', 'Halbs.',
  268. ' h.c.', 'Hdlg.', 'Hess.', ' heut.', ' heut.Rspr.', ' hins.', ' h.L.', ' h.Lit.',
  269. ' h.M.', 'Hrsg.', ' h.Rspr.', 'HS.', 'Hs.', ' i.A.', ' ib.', ' ibd.', ' ibid.',
  270. 'i.d.', 'i.d.F.', 'i.d.R.', 'i.d.S.', 'i.E.', 'i.e.', 'i.e.S.', 'i.H.d.', 'i.H.v.',
  271. 'i.K.', ' incl.', ' inkl.', 'inkl.MwSt.', ' insb.', ' insbes.', 'Int.', ' i.O.',
  272. ' i.R.', ' i.R.d.', 'i.S.', 'i.S.d.', 'i.S.e.', 'i.S.v.', 'i.ü.', ' iur.', 'i.V.',
  273. 'i.V.m.', 'i.W.', 'i.Wes.', 'i.w.S.', 'i.Zw.', 'Jahrb.', ' jew.', ' Jh.', 'JMBl.',
  274. ' jur.', ' Kap.', ' Ko.', ' krit.', ' kzfr.', 'Lb.', 'Lfg.', 'lgfr.', ' Lief.',
  275. 'Lit.', ' lit.', ' lt.', 'Ltd.', 'M.A.', 'm.Änd.', 'MABl.', 'mat.', 'm.a.W.', 'm.E.',
  276. ' med.', 'mgl.', 'Mglkt.', 'MinBl.', 'Mio.', ' Mot.', 'M.M.', 'm.N.', 'Mod.',
  277. ' mögl.', 'Mot.', 'MünchKomm.', 'm.w.', 'm.w.N.', 'MwSt.', 'Mwst.', 'm.W.v.',
  278. 'm.zust.Anm.', 'Nachw.', 'Nachw.b.', ' nat.', 'Nds.', 'Neubearb.', 'Neuf.',
  279. ' neugef.', 'n.F.', 'Nr.', 'Nrn.', ' o.', 'o.Ä.', ' od.', ' oec.', ' öff.',
  280. ' o.g.', ' österr.', 'p.F.V.', ' pharm.', ' phil.', ' pol.', 'Postf.', ' pp.',
  281. ' ppA.', ' ppa.', 'Prof.', 'Prot.', ' publ.', ' p.V.', 'p.V.V.', 'q.e.d.',
  282. 'RdErl.', 'Rdn.', 'Rdnr.', 'RdSchr.', ' rel.', ' rer.', 'RGBl.', 'Rn.', 'Rspr.',
  283. 'Rz.', 'S.', ' s.', 's.a.', 'Schr.', ' scil.', 'Sen.', ' sinngem.', 'SiZess.',
  284. 'Slg.', 's.o.', ' sog.', 'Sonderbeil.', 'Stpfl.', ' str.', ' st.', 'st.Rspr.',
  285. ' st. Rspr.', 'stud.iur.', 's.u.', ' teilw.', ' theol.', 'Thür.', ' TO.', ' tw.',
  286. 'Tz.', ' u.', 'u.a.', 'UAbs.', 'u.a.m.', ' umstr.', ' unmgl.', 'Unmglkt.', ' unmögl.',
  287. 'Urt.', ' usw.', ' u.U.', ' V.', ' v.', 'Var.', 'Ver.', ' vgl.', 'V.m.', 'VOBl.',
  288. 'Vor.', 'Vorbem.', 'Warn.', ' weg.', ' wg.', 'W.G.G.', 'w.z.b.w.', 'z.B.', 'z.Hd.',
  289. 'Ziff.', 'z.T.', ' zust.', 'zust.Anm.', ' zw.' 'z.Z.', ' zzgl.', ';', 'II.1.a.', '(s.',
  290. ]
  291. for abbrev in abbrev_list:
  292. if text.endswith(abbrev):
  293. return True
  294. if len(text) >= 3 and re.search(" .\\.", text[-3:]):
  295. return True
  296. return False
  297. def remove_leading_listing(sentence):
  298. """
  299. removes leading listings / enumerations like 1. or a)
  300. :param sentence: Sentence to remove from
  301. :return: Processed sentence
  302. """
  303. return split_leading_listing(sentence)[1]
  304. def split_leading_listing(sentence):
  305. """
  306. Splits the sentence from a possible listing (1. or a) ) at the start.
  307. :param sentence: Sentence to split
  308. :return: (start, rest) with start being the listing or None, if there is no listing and
  309. rest being the rest of the sentence or the original sentence if there was no listing
  310. """
  311. first_word = sentence.split()
  312. if first_word is None or len(first_word) == 0:
  313. first_word = ''
  314. else:
  315. first_word = first_word[0]
  316. rest = sentence[len(first_word) + 1:]
  317. # could be a name like M. Leicht
  318. if (first_word.endswith('.') or first_word.endswith(')')) and len(rest.split()) > 1 and first_word != 'Art.':
  319. # Enumeration!
  320. return first_word, rest
  321. else:
  322. return None, sentence
  323. def split_into_sentences(input_text):
  324. """
  325. Splits text into sentences. Uses spacy sentences but fixes broken sentences on \n or Abbreviations
  326. :param input_text: Text to split into sentences
  327. :return: A list of sentences which where split
  328. """
  329. paragraphs = input_text.split('\n')
  330. sentences = list()
  331. sentence_var = ''
  332. # roughly split original leitsatz into sentences
  333. for paragraph in paragraphs:
  334. nlp_paragraph = settings.nlp(paragraph)
  335. sentences_paragraph = []
  336. for sent in nlp_paragraph.sents:
  337. sent = sent.text.strip()
  338. # some leading listings aren't detected
  339. a, b = split_leading_listing(sent)
  340. if a is not None:
  341. sentences_paragraph.append(a)
  342. sentences_paragraph.append(b)
  343. for i in range(0, len(sentences_paragraph)):
  344. # add a space before next token if it isn't a sentence mark
  345. if not (sentences_paragraph[i].startswith('.') or sentences_paragraph[i].startswith(':')
  346. or sentences_paragraph[i].startswith('?') or sentences_paragraph[i].startswith('!')):
  347. sentence_var += ' '
  348. sentence_var += sentences_paragraph[i]
  349. # if not sentence_var.count('(') > sentence_var.count(
  350. # ')') and not sentence_var.strip() == '': # no unclosed brackets
  351. if (sentences_paragraph[i].endswith('.') or sentences_paragraph[i].endswith(':')
  352. or sentences_paragraph[i].endswith('?') or sentences_paragraph[i].endswith('!')) \
  353. and not abbreviation_ending(sentence_var) \
  354. and not sentence_var.strip() == '':
  355. # Satz sehr wahrscheinlich wirklich zuende
  356. sentences.append(sentence_var.strip())
  357. sentence_var = ''
  358. if not sentence_var.strip() == '':
  359. # if not sentence_var.count('(') > sentence_var.count(
  360. # ')') and not sentence_var.strip() == '': # no unclosed brackets
  361. sentences.append(sentence_var.strip()) # am Ende des Paragraphen soll auch fertig sein
  362. sentence_var = ''
  363. # end of whole text
  364. if sentence_var.strip() != '':
  365. sentences.append(sentence_var.strip())
  366. return sentences
  367. def preprocess_text(text, options):
  368. """
  369. Allows simple preprocessing like lemmatization on strings.
  370. :param text: Text to preprocess
  371. :param options: Options specifying on what preprocessing is to be done, if None, text will be returned
  372. :return: the preprocessed text, if text is None, the result will also be ''
  373. """
  374. if text is None:
  375. return ''
  376. if options is None:
  377. return text
  378. text_spacy = settings.nlp(text)
  379. result_text = ''
  380. for token in text_spacy:
  381. # stop-words removing: no stopwords or stopwords shouldn't be removed
  382. if not token.is_stop or pp_option_stopwords not in options or token.lemma_ in no_stopword_list:
  383. # lemmatization if wanted
  384. if pp_option_lemmatize in options and token.text not in sentence_marks:
  385. to_append = token.lemma_
  386. else:
  387. to_append = token.text
  388. if pp_option_remove_qout_marks_sing in options and to_append[0] == '"' and to_append[-1] == '"':
  389. to_append = to_append.replace('"', '')
  390. result_text += to_append + ' '
  391. result_text = result_text.strip()
  392. # case-normlaization, all to lower
  393. if pp_option_case_normalize in options:
  394. return result_text.lower()
  395. else:
  396. return result_text
  397. def create_dir(current_path, directory_name, delete=True):
  398. """
  399. Creates a directory if it doesn't exist
  400. :param current_path: path of the calling file
  401. :param directory_name: name / path to create
  402. :param delete: if True, than an old directory with same name will be delted
  403. """
  404. if delete and file_exists(current_path=current_path, path=directory_name):
  405. shutil.rmtree(server_path(current_path=current_path, path=directory_name))
  406. if not file_exists(current_path=current_path, path=directory_name):
  407. os.makedirs(server_path(current_path=current_path, path=directory_name))