import time
import xml.etree.ElementTree as ET
import urllib.request as request
import zipfile
import os

import pandas as pd

import settings
import utils
from utils import time_convert
base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'
dataframe_dir_bgh = 'dataframes/bgh/'
pickle_name_bgh = 'bgh_data.pkl'
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
nested_attributes = ["region_abk", "region_long"]
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'
current_path = 'data'
def get_file_list():
    """
    Makes an HTTP request for the list of files.

    :return: the root of the XML tree listing all current cases
    """
    xml_file, https_message = request.urlretrieve('https://www.rechtsprechung-im-internet.de/rii-toc.xml')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root
def count_cases(root, tag):
    """
    Counts all cases belonging to the given tag.

    :param root: downloaded XML tree with all files
    :param tag: tag to find in the court name (e.g. 'BGH')
    :return: number of cases belonging to that tag
    """
    count = 0
    for child in root:
        if tag in child[0].text:
            count += 1
    return count
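# A minimal usage sketch, assuming network access to rechtsprechung-im-internet.de:
# fetch the table of contents once and count the BGH entries.
#
#   toc_root = get_file_list()
#   print(count_cases(toc_root, 'BGH'))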
def download(base_dir, extended_dir, tag):
    """
    Downloads all cases into folders named after their senates.

    :param base_dir: name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download
    root = get_file_list()  # child[0] is the court, child[3] is the download link
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        try:
            if tag in child[0].text:
                filename, http = request.urlretrieve(child[3].text)
                with zipfile.ZipFile(filename, 'r') as zip_ref:
                    zip_ref.extractall(
                        utils.server_path(current_path=current_path,
                                          path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
                os.remove(filename)
                downloaded += 1
                print("\rDownloaded %d of %d " % (downloaded, max_cases) + tag + " cases", end="")
        except Exception:
            # a failed download or unpack of a single file is skipped;
            # the original while/try/finally construct silently swallowed these errors as well
            continue
    print("\nDone!")
def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: tuple of (filename, directory, directory extension) to address the file
    :return: a single-row pandas DataFrame with one column per attribute
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        attr = root.find(attribute)  # check leitsatz: two words run together, but the space is still there!
        text = ''
        for t in attr.itertext():
            if t in ('.', ',', ';', '!', '?'):
                text = text.strip()  # remove the space before these characters
            text += t + ' '
        text = text.strip()
        if text == '':
            res[attribute] = None
        else:
            res[attribute] = text
    for attribute in nested_attributes:
        nesting = attribute.split('_')
        xml_tag = root
        # walk down the nesting path to find the nested attribute
        for level in nesting:
            xml_tag = xml_tag.find(level)
        text = ""
        for t in xml_tag.itertext():
            if t in ('.', ',', ';', '!', '?'):
                text = text.strip()  # remove the space before these characters
            text += t + ' '
        text = text.strip()
        if text == '':
            res[attribute] = None
        else:
            res[attribute] = text
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])
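# Usage sketch for a single file; the file and senate names here are
# hypothetical, the tuple layout mirrors what create_pickle builds below.
#
#   df = read_file_data(('example_case.xml', 'some_senate', extended_dir_bgh))
#   print(df['aktenzeichen'].iloc[0])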
def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended directory containing the files
    :param pickle_name: name of the pickle to save
    :param steps: how many cases should be processed in this run
    """
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]
    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)
    pickle_path = dataframe_dir_bgh + extension + pickle_name
    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)
    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)
    print('Resulting dataframe has length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)
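# Usage sketch, matching the call in the main block below; steps controls how
# much of the remaining work is processed in this run (see
# utils.get_step_subset_raw for the exact semantics).
#
#   create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)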
def get_selected_bgh_data(directory='.\\'):
    """
    Shortcut for getting the BGH data currently needed: selects all judgments ('Urteil') from the civil senates.

    :param directory: directory offset from the current position, with trailing slash
    :return: the data
    """
    return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')
def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for accessing the BGH pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with trailing slash)
    :param spruchkoerper: can be used to select senates (checks whether the given string is contained
        in the data's spruchkoerper)
    :param doktyp: can be used to select specific document types (like 'Urteil', 'Beschluss', etc.);
        the document type must contain the given word
    :return: the data as a pandas DataFrame
    """
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    data = utils.df_from_pickle(current_path=current_path,
                                path=directory + dataframe_dir_bgh + extension + pickle_name)
    if spruchkoerper is not None:
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    data = data.dropna(axis=1, how='all')  # drop all columns with no value
    data = data.drop_duplicates()
    return data
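# Usage sketch: load all judgments ('Urteil') of the civil senates, which is
# what get_selected_bgh_data above does with an explicit directory offset.
#
#   df = get_data(pickle_name_bgh, directory='../data/',
#                 spruchkoerper='Zivilsenat', doktyp='Urteil')
#   print(df.shape)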
# if __name__ == "__main__":
#     download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
#     create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)