# Utility module: shared constants and helper functions for loading, splitting
# and preprocessing German court decision texts (Leitsatz, Tenor, Tatbestand,
# Entscheidungsgruende).
import json
import multiprocessing
import os
import re
import shutil
import pandas as pd
import settings
pool_processes = 8
pool_maxtask = 10
pool_chunksize = 30
leitsatz_str = 'leitsatz'
tenor_str = 'tenor'
tatbestand_str = 'tatbestand'
entscheidungsgruende_str = 'entscheidungsgruende'
aktenzeichen_str = 'aktenzeichen'
rii_text_columns = [leitsatz_str, tenor_str, tatbestand_str, entscheidungsgruende_str]
sentence_marks = ['.', ',', ';', '!', '?']
pp_option_lemmatize = 'preprocessing: lemmatize the text'
pp_option_stopwords = 'preprocessing: remove stopwords'
pp_option_case_normalize = 'preprocessing: normalize cases / put to lower'
pp_option_remove_qout_marks_sing = 'preprocessing: remove qoutation marks around single words'
no_stopword_list = ['nicht', 'kein']
entsch_gr_start_sentences = ['II.', 'B.', 'B']
def server_path(current_path, path):
"""
Method to add path in case it is run on server.
:param current_path: Path to add when run on server
:param path: Path for local
:return: Final path for local or server
"""
if settings.server:
path = current_path + '/' + path
return path
def open_file(current_path, path, modes, encoding=None, newline=None):
"""
Wraps the builtin open function to adjust to server settings
:param current_path: path of the calling file to adjust for server (without /)
:param path: Path for file loading relative to calling file
:param modes: Modes to apply
:param newline: newline option of the original method, if None nothing will be passed
:param encoding: encoding option of the original method, if None nothing will be passed
:return: the opened file
"""
if encoding is not None:
return open(server_path(current_path=current_path, path=path), modes, encoding=encoding)
if newline is not None:
return open(server_path(current_path=current_path, path=path), modes, newline=newline)
if newline is not None and encoding is not None:
return open(server_path(current_path=current_path, path=path), modes, encoding=encoding, newline=newline)
return open(server_path(current_path=current_path, path=path), modes)
def file_exists(current_path, path):
"""
Wraps the builtin exists function to adjust to server settings
:param current_path: path of the calling file to adjust for server (without /)
:param path: Path for file loading relative to calling file
:return: True if the file exists
"""
return os.path.exists(server_path(current_path=current_path, path=path))
def list_dir_files(current_path, path):
"""
Wraps the builtin os.listdir function to adjust to server settings
:param current_path: path of the calling file to adjust for server (without /)
:param path: Path for file loading relative to calling file
:return: The filenames of the directory
"""
return os.listdir(server_path(current_path=current_path, path=path))
def df_from_pickle(current_path, path):
"""
Wraps the pd.read_pickle function to adjust to server settings
:param current_path: path of the calling file to adjust for server (without /)
:param path: Path for file loading relative to calling file
:return: The loaded dataframe
"""
return pd.read_pickle(server_path(current_path=current_path, path=path))
def df_to_json(current_path, path, dataframe):
"""
Wraps the df.to_json function to adjust to server settings
:param current_path: path of the calling file to adjust for server (without /)
:param path: Path for file loading relative to calling file
:param dataframe: The dataframe to save
"""
dataframe.to_json(server_path(current_path=current_path, path=path))
def df_from_json(current_path, path):
"""
Wraps the json.load function in combination with a dataframe creation to adjust to server settings
:param current_path: path of the calling file to adjust for server (without /)
:param path: Path for file loading relative to calling file
:return: The loaded dataframe
"""
return pd.DataFrame(json.load(open_file(current_path=current_path, path=path, modes="r")))
def time_convert(sec):
"""
Gibt eine Zeitangabe hübsch aus. Format : Time Lapsed = hh:mm:ss
:param sec: Zeit zu zeigen
"""
mins = sec // 60
sec = sec % 60
hours = mins // 60
mins = mins % 60
print("Time Lapsed = {0}:{1}:{2}".format(int(hours), int(mins), sec))
def parallel_imap(function, packaged_args):
"""
Executes the given function in a parallel way. For list data.
:param function: Function to do in parallel.
:param packaged_args: Iterable of argumentpairs for each run to be done.
:return: Result of the parallel work
"""
if settings.server:
pool_obj = multiprocessing.Pool(maxtasksperchild=pool_maxtask)
result = pool_obj.imap(function, packaged_args, chunksize=pool_chunksize)
else:
pool_obj = multiprocessing.Pool(processes=pool_processes)
result = pool_obj.imap(function, packaged_args)
pool_obj.close()
pool_obj.join()
return result
def get_step_subset_raw(steps, path_to_dest_dataframe, source_data, dest_data, call_path):
"""
Method for stepwise work on datasets. Reads in the already present data and starts
where last time ended. Used for raw pickle-files in destination
:param steps: How many rows should be selcted now
:param path_to_dest_dataframe: Path on where to load the destination data
:param source_data: Source dataframe to select the rows
:param dest_data: empty dataframe to load the data into
:param call_path: path from which the method was called, for server path
:return: the subset of the source data an the loaded destintion data (source, dest)
"""
if steps > 0:
try:
try:
var = df_from_pickle(current_path=call_path, path=path_to_dest_dataframe)
except Exception:
var = df_from_json(current_path=call_path, path=path_to_dest_dataframe)
dest_data = pd.concat([dest_data, var], ignore_index=True)
start = dest_data.shape[0]
except OSError as _:
start = 0
finally:
end = start + steps
try: # case source is a dataframe
if end >= source_data.shape[0]:
return source_data.iloc[start:], dest_data # subset
else:
return source_data.iloc[start:end], dest_data # subset
except Exception:
if end >= len(source_data):
return source_data[start:], dest_data # subset
else:
return source_data[start:end], dest_data # subset
def remove_spaces_before_sentence_marks(text):
"""
Removes unneccessary spaces before '.' etc.
:param text: Text to replace in
:return: The cleaned text
"""
for sentence_mark in sentence_marks:
while ' ' + sentence_mark in text:
text = text.replace(' ' + sentence_mark, sentence_mark)
return text
def remove_brackets(text):
"""
Removes all matching round bracktet pairs () with their content. Always takes the first brackets that
appear in the text, so could also be an enumeration like a)
:param text: Text to remove the brackets from.
:return: Resulting text
"""
startindex = text.find('(')
res = ''
while startindex > -1:
endindex = startindex + text[startindex:].find(')')
if endindex > -1:
# in case there is a ' ' in front or after the brackets, remove one space
if startindex > 0 and text[startindex - 1] == ' ':
startindex -= 1
# if endindex < len(text) - 1 and text[endindex + 1] == ' ':
# endindex += 1
res += text[:startindex]
text = text[endindex + 1:]
else:
break
startindex = text.find('(')
res += text
return res
def remove_leading_keywords_and_listing_sentences(sentences):
"""
Method intended for Leitsätze. Some of them start with a single keyword in the first line.
This is removed. Additionally, Sentences which are only a listin ('1.') will also be removed.
:param sentences: List of sentences in the original order to remove these things from
:return: the list of sentences after removing
"""
# remove leading keywords and sentences which are only enumerations
sentences_var = list()
sentence_var = ''
for i in range(len(sentences)):
sentence = sentences[i].strip()
if len(sentence) > 1 and sentence[-1] == '.' and ' ' not in sentence: # at least two chars
if any(char.isdigit() for char in sentence) and sentence[0].isdigit(): # most likely enumeration like '1.'
continue
if i > 0 or (i == 0 and len(sentence) > 20):
# most likely not a short keyword at the beginning
if sentence[-1] == '.' or sentence[-1] == ',' or sentence[-1] == ':' or \
sentence[-1] == ';' or sentence[-1] == '!' or sentence[-1] == '?':
# sentence end
sentence_var += sentence
sentences_var.append(remove_spaces_before_sentence_marks(sentence_var))
sentence_var = ''
else:
# continuing sentence
sentence_var += sentence + ' '
return sentences_var
def prepare_leitsatz(l_satz):
"""
Does the preparation for Leitsätze: First splits into sentences, removes leading keywords and
single listing sentences and leading listings of sentences
:param l_satz: Original Leitsatz as one string
:return: prepared Leitsatz as a list of String
"""
sentences = split_into_sentences(l_satz)
sentences = remove_leading_keywords_and_listing_sentences(sentences)
sentences = [remove_leading_listing(sentence) for sentence in sentences]
return sentences
def select_list_subset(list_of_string, start_strings, end_string=None):
"""
Selects a subset of a list of strings. If the start_string is not in the list,
the whole original list is returned. (case-sensitive)
If more start strings are given, then it will be copied from the first occuring start string.
sometimes entscheidungsgruende II. is started not with II. but B. Use start_String_2 here
:param list_of_string: List to get subset from
:param start_strings: List of Strings to start to copy
:param end_string: First string where one shouldn't copy anymore, if none is given, then till the end
:return: Selected subset
"""
result_list = []
copy = False
for i in range(len(list_of_string)):
string = list_of_string[i]
if string in start_strings:
copy = True
if end_string is not None and string == end_string:
copy = False
if copy:
result_list.append(string)
# if nothing was found or very little was found
if len(result_list) == 0 or len(result_list) / len(list_of_string) < 0.2:
return list_of_string
return result_list
def abbreviation_ending(text):
"""
Checks for an input text whether it ends with a known legal abbreviation.
Known issues: numbers and roman numbering with following dots arent matched
:param text: Input Text
:return: True, if it does and with such an abbreviation, False otherwise
"""
abbrev_list = ['A.', ' a.', 'a.A.', 'a.a.O.', 'ABl.', ' abl.', 'Abs.', ' abs.', 'Abschn.', 'Abse.',
' abzgl.', 'a.D.', 'a.E.', ' a.F.', ' ähnl.', 'a.l.i.c.', ' allg.', ' allgem.',
'Alt.', 'AmtsBl.', ' and.', ' angef.', 'Anh.', 'Anl.', 'Anm.', ' Art.', '(Art.', ' aufgeh.',
'Aufl.', ' ausf.', 'Ausn.', 'BAnz.', 'BArbBl.', 'BayJMBl.', 'Bd.', 'Bde.', 'Bdg.',
'Bearb.', ' begr.', 'Beil.', 'Bek.', ' ber.', ' bes.', 'Beschl.', ' best.', ' bestr.',
'Betr.', ' betr.', 'Bf.', 'BGBl.', ' bish.', ' Bl.', 'BPräs.', 'BReg.', 'Bsp.', 'Bst.',
'BStBl.', 'BT-Drucks.', 'Buchst.', 'bzgl.', 'bzw.', 'c.i.c.', 'Co.', 'c.p.c.',
'c.s.q.n.', 'Ct.', ' dar.', 'Darst.', ' ders.', 'd.h.', 'Diss.', ' div.', 'Dr.',
'Drucks.', ' dto.', 'DVBl.', ' ebd.', ' Ed.', 'E.G.', ' eingef.', 'Einf.', 'Einl.',
' einschl.', 'Erg.', ' erk.Sen.', ' erk.', ' Erl.', 'etc.', 'E.U.', ' e.V.',
'EVertr.', ' evtl.', 'E.W.G.', ' F.', ' f.', ' Fa.', ' Festschr.', ' ff.', ' Fn.',
' form.', ' fr.', ' fr.Rspr.', ' Fz.', 'GBl.', ' geänd.', 'Gedschr.', ' geg.',
' gem.', 'Ges.', ' gg.', ' ggf.', ' ggü.', ' ggüb.', ' Gl.', ' GMBl.', 'G.o.A.',
'Grds.', ' grdsl.', 'Großkomm.', 'Großkomm.z.', 'GVBl.', 'GVOBl.', ' h.A.', 'Halbs.',
' h.c.', 'Hdlg.', 'Hess.', ' heut.', ' heut.Rspr.', ' hins.', ' h.L.', ' h.Lit.',
' h.M.', 'Hrsg.', ' h.Rspr.', 'HS.', 'Hs.', ' i.A.', ' ib.', ' ibd.', ' ibid.',
'i.d.', 'i.d.F.', 'i.d.R.', 'i.d.S.', 'i.E.', 'i.e.', 'i.e.S.', 'i.H.d.', 'i.H.v.',
'i.K.', ' incl.', ' inkl.', 'inkl.MwSt.', ' insb.', ' insbes.', 'Int.', ' i.O.',
' i.R.', ' i.R.d.', 'i.S.', 'i.S.d.', 'i.S.e.', 'i.S.v.', 'i.ü.', ' iur.', 'i.V.',
'i.V.m.', 'i.W.', 'i.Wes.', 'i.w.S.', 'i.Zw.', 'Jahrb.', ' jew.', ' Jh.', 'JMBl.',
' jur.', ' Kap.', ' Ko.', ' krit.', ' kzfr.', 'Lb.', 'Lfg.', 'lgfr.', ' Lief.',
'Lit.', ' lit.', ' lt.', 'Ltd.', 'M.A.', 'm.Änd.', 'MABl.', 'mat.', 'm.a.W.', 'm.E.',
' med.', 'mgl.', 'Mglkt.', 'MinBl.', 'Mio.', ' Mot.', 'M.M.', 'm.N.', 'Mod.',
' mögl.', 'Mot.', 'MünchKomm.', 'm.w.', 'm.w.N.', 'MwSt.', 'Mwst.', 'm.W.v.',
'm.zust.Anm.', 'Nachw.', 'Nachw.b.', ' nat.', 'Nds.', 'Neubearb.', 'Neuf.',
' neugef.', 'n.F.', 'Nr.', 'Nrn.', ' o.', 'o.Ä.', ' od.', ' oec.', ' öff.',
' o.g.', ' österr.', 'p.F.V.', ' pharm.', ' phil.', ' pol.', 'Postf.', ' pp.',
' ppA.', ' ppa.', 'Prof.', 'Prot.', ' publ.', ' p.V.', 'p.V.V.', 'q.e.d.',
'RdErl.', 'Rdn.', 'Rdnr.', 'RdSchr.', ' rel.', ' rer.', 'RGBl.', 'Rn.', 'Rspr.',
'Rz.', 'S.', ' s.', 's.a.', 'Schr.', ' scil.', 'Sen.', ' sinngem.', 'SiZess.',
'Slg.', 's.o.', ' sog.', 'Sonderbeil.', 'Stpfl.', ' str.', ' st.', 'st.Rspr.',
' st. Rspr.', 'stud.iur.', 's.u.', ' teilw.', ' theol.', 'Thür.', ' TO.', ' tw.',
'Tz.', ' u.', 'u.a.', 'UAbs.', 'u.a.m.', ' umstr.', ' unmgl.', 'Unmglkt.', ' unmögl.',
'Urt.', ' usw.', ' u.U.', ' V.', ' v.', 'Var.', 'Ver.', ' vgl.', 'V.m.', 'VOBl.',
'Vor.', 'Vorbem.', 'Warn.', ' weg.', ' wg.', 'W.G.G.', 'w.z.b.w.', 'z.B.', 'z.Hd.',
'Ziff.', 'z.T.', ' zust.', 'zust.Anm.', ' zw.' 'z.Z.', ' zzgl.', ';', 'II.1.a.', '(s.',
]
for abbrev in abbrev_list:
if text.endswith(abbrev):
return True
if len(text) >= 3 and re.search(" .\\.", text[-3:]):
return True
return False
def remove_leading_listing(sentence):
"""
removes leading listings / enumerations like 1. or a)
:param sentence: Sentence to remove from
:return: Processed sentence
"""
return split_leading_listing(sentence)[1]
def split_leading_listing(sentence):
"""
Splits the sentence from a possible listing (1. or a) ) at the start.
:param sentence: Sentence to split
:return: (start, rest) with start being the listing or None, if there is no listing and
rest being the rest of the sentence or the original sentence if there was no listing
"""
first_word = sentence.split()
if first_word is None or len(first_word) == 0:
first_word = ''
else:
first_word = first_word[0]
rest = sentence[len(first_word) + 1:]
# could be a name like M. Leicht
if (first_word.endswith('.') or first_word.endswith(')')) and len(rest.split()) > 1 and first_word != 'Art.':
# Enumeration!
return first_word, rest
else:
return None, sentence
def split_into_sentences(input_text):
"""
Splits text into sentences. Uses spacy sentences but fixes broken sentences on \n or Abbreviations
:param input_text: Text to split into sentences
:return: A list of sentences which where split
"""
paragraphs = input_text.split('\n')
sentences = list()
sentence_var = ''
# roughly split original leitsatz into sentences
for paragraph in paragraphs:
nlp_paragraph = settings.nlp(paragraph)
sentences_paragraph = []
for sent in nlp_paragraph.sents:
sent = sent.text.strip()
# some leading listings aren't detected
a, b = split_leading_listing(sent)
if a is not None:
sentences_paragraph.append(a)
sentences_paragraph.append(b)
for i in range(0, len(sentences_paragraph)):
# add a space before next token if it isn't a sentence mark
if not (sentences_paragraph[i].startswith('.') or sentences_paragraph[i].startswith(':')
or sentences_paragraph[i].startswith('?') or sentences_paragraph[i].startswith('!')):
sentence_var += ' '
sentence_var += sentences_paragraph[i]
# if not sentence_var.count('(') > sentence_var.count(
# ')') and not sentence_var.strip() == '': # no unclosed brackets
if (sentences_paragraph[i].endswith('.') or sentences_paragraph[i].endswith(':')
or sentences_paragraph[i].endswith('?') or sentences_paragraph[i].endswith('!')) \
and not abbreviation_ending(sentence_var) \
and not sentence_var.strip() == '':
# Satz sehr wahrscheinlich wirklich zuende
sentences.append(sentence_var.strip())
sentence_var = ''
if not sentence_var.strip() == '':
# if not sentence_var.count('(') > sentence_var.count(
# ')') and not sentence_var.strip() == '': # no unclosed brackets
sentences.append(sentence_var.strip()) # am Ende des Paragraphen soll auch fertig sein
sentence_var = ''
# end of whole text
if sentence_var.strip() != '':
sentences.append(sentence_var.strip())
return sentences
def preprocess_text(text, options):
"""
Allows simple preprocessing like lemmatization on strings.
:param text: Text to preprocess
:param options: Options specifying on what preprocessing is to be done, if None, text will be returned
:return: the preprocessed text, if text is None, the result will also be ''
"""
if text is None:
return ''
if options is None:
return text
text_spacy = settings.nlp(text)
result_text = ''
for token in text_spacy:
# stop-words removing: no stopwords or stopwords shouldn't be removed
if not token.is_stop or pp_option_stopwords not in options or token.lemma_ in no_stopword_list:
# lemmatization if wanted
if pp_option_lemmatize in options and token.text not in sentence_marks:
to_append = token.lemma_
else:
to_append = token.text
if pp_option_remove_qout_marks_sing in options and to_append[0] == '"' and to_append[-1] == '"':
to_append = to_append.replace('"', '')
result_text += to_append + ' '
result_text = result_text.strip()
# case-normlaization, all to lower
if pp_option_case_normalize in options:
return result_text.lower()
else:
return result_text
def create_dir(current_path, directory_name, delete=True):
"""
Creates a directory if it doesn't exist
:param current_path: path of the calling file
:param directory_name: name / path to create
:param delete: if True, than an old directory with same name will be delted
"""
if delete and file_exists(current_path=current_path, path=directory_name):
shutil.rmtree(server_path(current_path=current_path, path=directory_name))
if not file_exists(current_path=current_path, path=directory_name):
os.makedirs(server_path(current_path=current_path, path=directory_name))