ICAIL_2023/utils.py

import json
import multiprocessing
import os
import re
import shutil

import pandas as pd

import settings

# --- worker pool configuration (read by parallel_imap) ---
pool_processes = 8  # number of worker processes for local runs
pool_maxtask = 10  # maxtasksperchild for server runs
pool_chunksize = 30  # imap chunksize for server runs

# --- names of the text sections / columns of a decision ---
leitsatz_str = 'leitsatz'
tenor_str = 'tenor'
tatbestand_str = 'tatbestand'
entscheidungsgruende_str = 'entscheidungsgruende'
aktenzeichen_str = 'aktenzeichen'
rii_text_columns = [
    leitsatz_str,
    tenor_str,
    tatbestand_str,
    entscheidungsgruende_str,
]

# punctuation that must not be preceded by a space and is never lemmatized
sentence_marks = ['.', ',', ';', '!', '?']

# --- option flags understood by preprocess_text ---
pp_option_lemmatize = 'preprocessing: lemmatize the text'
pp_option_stopwords = 'preprocessing: remove stopwords'
pp_option_case_normalize = 'preprocessing: normalize cases / put to lower'
pp_option_remove_qout_marks_sing = 'preprocessing: remove qoutation marks around single words'

# lemmas that are kept even when stopword removal is switched on
no_stopword_list = ['nicht', 'kein']

# presumably the markers that open the relevant part of the Entscheidungsgruende
# (see select_list_subset); not referenced in this part of the file - TODO confirm
entsch_gr_start_sentences = ['II.', 'B.', 'B']


def server_path(current_path, path):
    """
    Resolves a path depending on whether the code runs on the server.

    :param current_path: Directory that is prepended (joined with '/') when settings.server is truthy
    :param path: Path as it is used for local runs
    :return: current_path + '/' + path on the server, otherwise path unchanged
    """
    if not settings.server:
        return path
    return current_path + '/' + path


def open_file(current_path, path, modes, encoding=None, newline=None):
    """
    Wraps the builtin open function to adjust to server settings

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :param modes: Modes to apply
    :param newline: newline option of the builtin open, None means the builtin default
    :param encoding: encoding option of the builtin open, None means the builtin default
    :return: the opened file; the caller is responsible for closing it
    """
    # None is the builtin default for both keywords, so forwarding them unconditionally
    # equals "not passing" them and also honours the case where BOTH are given
    # (the earlier branch order silently dropped newline in that case).
    return open(server_path(current_path=current_path, path=path), modes, encoding=encoding, newline=newline)


def file_exists(current_path, path):
    """
    Server-aware variant of os.path.exists.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path to check, relative to the calling file
    :return: True if the resolved path exists, False otherwise
    """
    resolved = server_path(current_path=current_path, path=path)
    return os.path.exists(resolved)


def list_dir_files(current_path, path):
    """
    Server-aware variant of os.listdir.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Directory to list, relative to the calling file
    :return: The entry names of the resolved directory
    """
    resolved = server_path(current_path=current_path, path=path)
    return os.listdir(resolved)


def df_from_pickle(current_path, path):
    """
    Server-aware variant of pd.read_pickle.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Pickle file to load, relative to the calling file
    :return: The object returned by pd.read_pickle for the resolved path
    """
    resolved = server_path(current_path=current_path, path=path)
    return pd.read_pickle(resolved)


def df_to_json(current_path, path, dataframe):
    """
    Server-aware variant of DataFrame.to_json.

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Destination file, relative to the calling file
    :param dataframe: The dataframe to save
    """
    target = server_path(current_path=current_path, path=path)
    dataframe.to_json(target)


def df_from_json(current_path, path):
    """
    Wraps the json.load function in combination with a dataframe creation to adjust to server settings

    :param current_path: path of the calling file to adjust for server (without /)
    :param path: Path for file loading relative to calling file
    :return: The loaded dataframe
    """
    # context manager so the file handle is closed again (it was leaked before)
    with open_file(current_path=current_path, path=path, modes="r") as json_file:
        content = json.load(json_file)
    return pd.DataFrame(content)


def time_convert(sec):
    """
    Prints a duration in a readable way. Format: Time Lapsed = hours:minutes:seconds
    (the parts are not zero-padded, the seconds keep their fractional part).

    :param sec: Duration in seconds to print
    """
    total_minutes, rest_seconds = divmod(sec, 60)
    hours, minutes = divmod(total_minutes, 60)
    print("Time Lapsed = {0}:{1}:{2}".format(int(hours), int(minutes), rest_seconds))


def parallel_imap(function, packaged_args):
    """
    Maps a function over an iterable with a multiprocessing pool.

    On the server the pool uses its default size with maxtasksperchild=pool_maxtask and
    imap is called with chunksize=pool_chunksize; locally a pool of pool_processes workers
    is used with the default chunksize. The pool is closed and joined before returning.

    :param function: Function to run in parallel.
    :param packaged_args: Iterable with one (packaged) argument per call.
    :return: The iterator returned by Pool.imap
    """
    if settings.server:
        pool_obj = multiprocessing.Pool(maxtasksperchild=pool_maxtask)
        imap_options = {'chunksize': pool_chunksize}
    else:
        pool_obj = multiprocessing.Pool(processes=pool_processes)
        imap_options = {}
    result = pool_obj.imap(function, packaged_args, **imap_options)
    # no more tasks are submitted; wait for the workers to finish
    pool_obj.close()
    pool_obj.join()
    return result


def get_step_subset_raw(steps, path_to_dest_dataframe, source_data, dest_data, call_path):
    """
    Method for stepwise work on datasets. Reads in the already present data and starts
    where last time ended. Used for raw pickle-files in destination (with a JSON fallback).

    :param steps: How many rows should be selected now
    :param path_to_dest_dataframe: Path on where to load the destination data
    :param source_data: Source dataframe (or plain sequence) to select the rows from
    :param dest_data: empty dataframe to load the data into
    :param call_path: path from which the method was called, for server path
    :return: the subset of the source data and the loaded destination data (source, dest);
             None if steps is not positive
    """
    if steps <= 0:
        # nothing requested - callers get None, exactly as before
        return None
    try:
        try:
            loaded = df_from_pickle(current_path=call_path, path=path_to_dest_dataframe)
        except Exception:
            # missing or not a readable pickle -> try the JSON representation instead
            loaded = df_from_json(current_path=call_path, path=path_to_dest_dataframe)
    except OSError:
        # no destination data yet -> start from the very beginning
        start = 0
    else:
        dest_data = pd.concat([dest_data, loaded], ignore_index=True)
        start = dest_data.shape[0]
    # The slicing happens outside of any finally block: a return inside finally would
    # swallow unexpected errors (or turn them into an UnboundLocalError on start).
    end = start + steps
    # slices are clamped to the available length, so no explicit end check is needed
    if hasattr(source_data, 'iloc'):  # case source is a dataframe
        return source_data.iloc[start:end], dest_data
    return source_data[start:end], dest_data


def remove_spaces_before_sentence_marks(text):
    """
    Deletes every space that directly precedes one of the sentence_marks ('.', ',' etc.).
    Runs of several spaces in front of a mark are removed completely.

    :param text: Text to clean
    :return: The cleaned text
    """
    for mark in sentence_marks:
        spaced_mark = ' ' + mark
        # repeat until no space is left in front of this mark
        while spaced_mark in text:
            text = text.replace(spaced_mark, mark)
    return text


def remove_brackets(text):
    """
    Removes round bracket pairs () together with their content. Always pairs the first '('
    with the first ')' that follows it, so nested brackets are not balanced. A ')' without a
    preceding '(' (e.g. an enumeration like 'a)') is left alone, and so is an opening bracket
    that is never closed.

    :param text: Text to remove the brackets from.
    :return: Resulting text
    """
    kept_parts = []
    start = text.find('(')
    while start > -1:
        # search the closing bracket only behind the opening one
        end = text.find(')', start)
        if end == -1:
            # unmatched '(' - keep the remaining text untouched (including a space before it)
            break
        # if there is a ' ' in front of the brackets, remove that one space as well
        cut = start - 1 if start > 0 and text[start - 1] == ' ' else start
        kept_parts.append(text[:cut])
        text = text[end + 1:]
        start = text.find('(')
    kept_parts.append(text)
    return ''.join(kept_parts)


def remove_leading_keywords_and_listing_sentences(sentences):
    """
    Method intended for Leitsaetze. Some of them start with a single keyword in the first line.
    This is removed (a first entry of at most 20 characters is dropped). Additionally, entries
    which are only a listing ('1.') are removed and entries without a closing sentence mark are
    merged with the entries that follow them. Empty entries are skipped.

    NOTE: a trailing fragment that never reaches a sentence mark is not part of the result.

    :param sentences: List of sentences in the original order to remove these things from
    :return: the list of sentences after removing
    """
    sentence_endings = ('.', ',', ':', ';', '!', '?')
    cleaned = []
    pending = ''
    for i, raw_sentence in enumerate(sentences):
        sentence = raw_sentence.strip()
        if not sentence:
            # nothing to inspect; indexing an empty string would raise an IndexError
            continue
        if len(sentence) > 1 and sentence[-1] == '.' and ' ' not in sentence and sentence[0].isdigit():
            # a single token that starts with a digit and ends with '.': most likely '1.'
            continue
        if i == 0 and len(sentence) <= 20:
            # most likely a short keyword at the beginning
            continue
        if sentence.endswith(sentence_endings):
            # sentence end: flush everything collected so far
            cleaned.append(remove_spaces_before_sentence_marks(pending + sentence))
            pending = ''
        else:
            # continuing sentence
            pending += sentence + ' '
    return cleaned


def prepare_leitsatz(l_satz):
    """
    Prepares a Leitsatz: the text is split into sentences, leading keywords and
    listing-only sentences are dropped and a leading listing is cut off every remaining sentence.

    :param l_satz: Original Leitsatz as one string
    :return: prepared Leitsatz as a list of strings
    """
    raw_sentences = split_into_sentences(l_satz)
    kept_sentences = remove_leading_keywords_and_listing_sentences(raw_sentences)
    return list(map(remove_leading_listing, kept_sentences))


def select_list_subset(list_of_string, start_strings, end_string=None):
    """
    Selects a subset of a list of strings (case-sensitive). Copying starts at the first entry
    that is one of the start strings (that entry included) and stops at an entry equal to
    end_string (that entry excluded); a later start string switches copying on again.
    If nothing was selected or less than 20% of the entries were selected, the original list
    is returned.

    Several start strings are useful because e.g. the Entscheidungsgruende part 'II.' is
    sometimes introduced with 'B.' instead.

    :param list_of_string: List to get subset from
    :param start_strings: List of strings at which copying starts
    :param end_string: First string which shouldn't be copied anymore, if None is given, then till the end
    :return: Selected subset or the original list
    """
    selected = []
    copying = False
    for entry in list_of_string:
        if entry in start_strings:
            copying = True
        if end_string is not None and entry == end_string:
            copying = False
        if copying:
            selected.append(entry)
    # nothing or only very little was found -> fall back to the complete list
    found_enough = len(selected) != 0 and len(selected) / len(list_of_string) >= 0.2
    return selected if found_enough else list_of_string


# Known legal abbreviations (plus a few other non-sentence endings such as ';').
# Entries with a leading space only match when the abbreviation is a word of its own.
# Kept as a tuple so that it can be handed to str.endswith directly.
_ABBREVIATION_ENDINGS = (
    'A.', ' a.', 'a.A.', 'a.a.O.', 'ABl.', ' abl.', 'Abs.', ' abs.', 'Abschn.', 'Abse.',
    ' abzgl.', 'a.D.', 'a.E.', ' a.F.', ' ähnl.', 'a.l.i.c.', ' allg.', ' allgem.',
    'Alt.', 'AmtsBl.', ' and.', ' angef.', 'Anh.', 'Anl.', 'Anm.', ' Art.', '(Art.', ' aufgeh.',
    'Aufl.', ' ausf.', 'Ausn.', 'BAnz.', 'BArbBl.', 'BayJMBl.', 'Bd.', 'Bde.', 'Bdg.',
    'Bearb.', ' begr.', 'Beil.', 'Bek.', ' ber.', ' bes.', 'Beschl.', ' best.', ' bestr.',
    'Betr.', ' betr.', 'Bf.', 'BGBl.', ' bish.', ' Bl.', 'BPräs.', 'BReg.', 'Bsp.', 'Bst.',
    'BStBl.', 'BT-Drucks.', 'Buchst.', 'bzgl.', 'bzw.', 'c.i.c.', 'Co.', 'c.p.c.',
    'c.s.q.n.', 'Ct.', ' dar.', 'Darst.', ' ders.', 'd.h.', 'Diss.', ' div.', 'Dr.',
    'Drucks.', ' dto.', 'DVBl.', ' ebd.', ' Ed.', 'E.G.', ' eingef.', 'Einf.', 'Einl.',
    ' einschl.', 'Erg.', ' erk.Sen.', ' erk.', ' Erl.', 'etc.', 'E.U.', ' e.V.',
    'EVertr.', ' evtl.', 'E.W.G.', ' F.', ' f.', ' Fa.', ' Festschr.', ' ff.', ' Fn.',
    ' form.', ' fr.', ' fr.Rspr.', ' Fz.', 'GBl.', ' geänd.', 'Gedschr.', ' geg.',
    ' gem.', 'Ges.', ' gg.', ' ggf.', ' ggü.', ' ggüb.', ' Gl.', ' GMBl.', 'G.o.A.',
    'Grds.', ' grdsl.', 'Großkomm.', 'Großkomm.z.', 'GVBl.', 'GVOBl.', ' h.A.', 'Halbs.',
    ' h.c.', 'Hdlg.', 'Hess.', ' heut.', ' heut.Rspr.', ' hins.', ' h.L.', ' h.Lit.',
    ' h.M.', 'Hrsg.', ' h.Rspr.', 'HS.', 'Hs.', ' i.A.', ' ib.', ' ibd.', ' ibid.',
    'i.d.', 'i.d.F.', 'i.d.R.', 'i.d.S.', 'i.E.', 'i.e.', 'i.e.S.', 'i.H.d.', 'i.H.v.',
    'i.K.', ' incl.', ' inkl.', 'inkl.MwSt.', ' insb.', ' insbes.', 'Int.', ' i.O.',
    ' i.R.', ' i.R.d.', 'i.S.', 'i.S.d.', 'i.S.e.', 'i.S.v.', 'i.ü.', ' iur.', 'i.V.',
    'i.V.m.', 'i.W.', 'i.Wes.', 'i.w.S.', 'i.Zw.', 'Jahrb.', ' jew.', ' Jh.', 'JMBl.',
    ' jur.', ' Kap.', ' Ko.', ' krit.', ' kzfr.', 'Lb.', 'Lfg.', 'lgfr.', ' Lief.',
    'Lit.', ' lit.', ' lt.', 'Ltd.', 'M.A.', 'm.Änd.', 'MABl.', 'mat.', 'm.a.W.', 'm.E.',
    ' med.', 'mgl.', 'Mglkt.', 'MinBl.', 'Mio.', ' Mot.', 'M.M.', 'm.N.', 'Mod.',
    ' mögl.', 'Mot.', 'MünchKomm.', 'm.w.', 'm.w.N.', 'MwSt.', 'Mwst.', 'm.W.v.',
    'm.zust.Anm.', 'Nachw.', 'Nachw.b.', ' nat.', 'Nds.', 'Neubearb.', 'Neuf.',
    ' neugef.', 'n.F.', 'Nr.', 'Nrn.', ' o.', 'o.Ä.', ' od.', ' oec.', ' öff.',
    ' o.g.', ' österr.', 'p.F.V.', ' pharm.', ' phil.', ' pol.', 'Postf.', ' pp.',
    ' ppA.', ' ppa.', 'Prof.', 'Prot.', ' publ.', ' p.V.', 'p.V.V.', 'q.e.d.',
    'RdErl.', 'Rdn.', 'Rdnr.', 'RdSchr.', ' rel.', ' rer.', 'RGBl.', 'Rn.', 'Rspr.',
    'Rz.', 'S.', ' s.', 's.a.', 'Schr.', ' scil.', 'Sen.', ' sinngem.', 'SiZess.',
    'Slg.', 's.o.', ' sog.', 'Sonderbeil.', 'Stpfl.', ' str.', ' st.', 'st.Rspr.',
    ' st. Rspr.', 'stud.iur.', 's.u.', ' teilw.', ' theol.', 'Thür.', ' TO.', ' tw.',
    'Tz.', ' u.', 'u.a.', 'UAbs.', 'u.a.m.', ' umstr.', ' unmgl.', 'Unmglkt.', ' unmögl.',
    'Urt.', ' usw.', ' u.U.', ' V.', ' v.', 'Var.', 'Ver.', ' vgl.', 'V.m.', 'VOBl.',
    'Vor.', 'Vorbem.', 'Warn.', ' weg.', ' wg.', 'W.G.G.', 'w.z.b.w.', 'z.B.', 'z.Hd.',
    # ' zw.' and 'z.Z.' are two separate entries; a missing comma used to fuse them
    'Ziff.', 'z.T.', ' zust.', 'zust.Anm.', ' zw.', 'z.Z.', ' zzgl.', ';', 'II.1.a.', '(s.',
)

# a single character between a space and a dot, e.g. the initial in 'Herr M.'
_SINGLE_CHAR_ABBREVIATION = re.compile(r" .\.")


def abbreviation_ending(text):
    """
    Checks for an input text whether it ends with a known legal abbreviation
    (or with a single character followed by a dot, like an initial).
    Known issues: numbers and roman numbering with following dots aren't matched

    :param text: Input Text
    :return: True, if it does end with such an abbreviation, False otherwise
    """
    if text.endswith(_ABBREVIATION_ENDINGS):
        return True
    # only the last three characters are relevant for ' X.'
    return len(text) >= 3 and _SINGLE_CHAR_ABBREVIATION.search(text[-3:]) is not None


def remove_leading_listing(sentence):
    """
    Cuts a leading listing / enumeration like 1. or a) off a sentence.

    :param sentence: Sentence to remove the listing from
    :return: The sentence without the listing (unchanged if there was none)
    """
    _listing, remainder = split_leading_listing(sentence)
    return remainder


def split_leading_listing(sentence):
    """
    Splits the sentence from a possible listing (1. or a) ) at the start.

    The first word counts as a listing if it ends with '.' or ')', is not 'Art.' and is
    followed by at least two more words (a single following word could be a name like
    'M. Leicht').

    :param sentence: Sentence to split
    :return: (start, rest) with start being the listing or None, if there is no listing and
                rest being the rest of the sentence or the original sentence if there was no listing
    """
    # Split on whitespace instead of slicing by offset: leading whitespace or several
    # separating blanks would otherwise shift the cut and corrupt the rest.
    parts = sentence.split(None, 1)
    if len(parts) < 2:
        # empty sentence or a single word: nothing that could follow a listing
        return None, sentence
    first_word, rest = parts
    looks_like_listing = first_word.endswith(('.', ')')) and first_word != 'Art.'
    if looks_like_listing and len(rest.split()) > 1:
        # Enumeration!
        return first_word, rest
    return None, sentence


def split_into_sentences(input_text):
    """
    Splits text into sentences. Every line of the text is run through settings.nlp and the
    sentences it reports (a spaCy pipeline according to the original documentation) are
    re-joined where the split happened behind an abbreviation. A sentence never spans a
    line break.

    :param input_text: Text to split into sentences
    :return: A list of the sentences which were found
    """
    boundary_marks = ('.', ':', '?', '!')
    sentences = []
    pending = ''
    # roughly split the original text into sentences, paragraph by paragraph
    for paragraph in input_text.split('\n'):
        fragments = []
        for span in settings.nlp(paragraph).sents:
            # some leading listings aren't detected, so they are split off manually
            listing, remainder = split_leading_listing(span.text.strip())
            if listing is not None:
                fragments.append(listing)
            fragments.append(remainder)
        for fragment in fragments:
            # add a space before the next fragment if it doesn't start with a sentence mark
            if not fragment.startswith(boundary_marks):
                pending += ' '
            pending += fragment
            if fragment.endswith(boundary_marks) \
                    and not abbreviation_ending(pending) \
                    and pending.strip() != '':
                # the sentence is most likely really finished
                sentences.append(pending.strip())
                pending = ''
        if pending.strip() != '':
            # whatever is left has to end with the paragraph
            sentences.append(pending.strip())
            pending = ''
    # end of whole text
    if pending.strip() != '':
        sentences.append(pending.strip())
    return sentences


def preprocess_text(text, options):
    """
    Simple token based preprocessing of a string: stopword removal, lemmatization, removal of
    quotation marks and lower-casing, each only if the matching pp_option_* entry is
    contained in options. The kept tokens are joined with single spaces.

    :param text: Text to preprocess
    :param options: Container of pp_option_* values to apply, if None, text will be returned unchanged
    :return: the preprocessed text, if text is None, the result will be ''
    """
    if text is None:
        return ''
    if options is None:
        return text
    pieces = []
    for token in settings.nlp(text):
        # drop stopwords only if requested and the lemma is not explicitly protected
        if token.is_stop and pp_option_stopwords in options and token.lemma_ not in no_stopword_list:
            continue
        # sentence marks are never lemmatized
        use_lemma = pp_option_lemmatize in options and token.text not in sentence_marks
        piece = token.lemma_ if use_lemma else token.text
        if pp_option_remove_qout_marks_sing in options and piece[0] == '"' and piece[-1] == '"':
            piece = piece.replace('"', '')
        pieces.append(piece)
    joined = ' '.join(pieces).strip()
    # case-normalization, all to lower
    return joined.lower() if pp_option_case_normalize in options else joined


def create_dir(current_path, directory_name, delete=True):
    """
    Makes sure a directory exists. With delete=True an already existing directory of the
    same name is removed together with its content and created again (empty).

    :param current_path: path of the calling file
    :param directory_name: name / path to create
    :param delete: if True, an old directory with the same name will be deleted first
    """
    target = server_path(current_path=current_path, path=directory_name)
    if delete and os.path.exists(target):
        shutil.rmtree(target)
    if not os.path.exists(target):
        os.makedirs(target)