import os
import time
import urllib.request as request
import xml.etree.ElementTree as ET
import zipfile

import pandas as pd

import settings
import utils
from utils import time_convert

base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'
dataframe_dir_bgh = 'dataframes/bgh/'
pickle_name_bgh = 'bgh_data.pkl'
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
nested_attributes = ["region_abk", "region_long"]
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'
current_path = 'data'

def get_file_list():
    """
    Fetches the current table of contents from rechtsprechung-im-internet.de.

    :return: the root of the XML tree listing all currently published cases
    """
    xml_file, https_message = request.urlretrieve('https://www.rechtsprechung-im-internet.de/rii-toc.xml')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root

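# Assumed feed layout (a sketch inferred from the indices used in download()
# below, not confirmed elsewhere): each child of the root describes one case,
# with child[0] holding the court name and child[3] the link to the zipped XML.
# A quick interactive check of that assumption:
# root = get_file_list()
# for child in list(root)[:3]:
#     print([element.tag for element in child])
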
def count_cases(root, tag):
    """
    Counts all cases whose court name contains the given tag and returns the count.

    :param root: downloaded xml-tree with all files
    :param tag: tag to look for in the court name (e.g. 'BGH')
    :return: number of matching cases
    """
    count = 0
    for child in root:
        if tag in child[0].text:
            count += 1
    return count

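# Quick check (hypothetical usage): how many BGH cases are currently listed?
# root = get_file_list()
# print(count_cases(root, 'BGH'))
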
def download(base_dir, extended_dir, tag):
    """
    Downloads all cases into one folder per senate.

    :param base_dir: name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download; child[0] is the court, child[3] is the download link
    root = get_file_list()
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        if tag not in child[0].text:
            continue
        try:
            filename, http = request.urlretrieve(child[3].text)
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(
                    utils.server_path(current_path=current_path,
                                      path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
            os.remove(filename)
            downloaded += 1
            print("\rDownloaded %d of %d %s cases" % (downloaded, max_cases, tag), end="")
        except Exception:
            # skip cases whose download or extraction fails; the original
            # while/try/finally construct silently discarded these errors too
            continue
    print("\nDone!")

def extract_text(xml_element):
    """
    Collects the text of an xml element, stripping the space that itertext()
    chunking leaves before sentence marks.

    :param xml_element: element whose text content should be gathered, may be None
    :return: the combined text, or None if the element is missing or empty
    """
    if xml_element is None:  # guard: find() returns None for missing tags
        return None
    text = ''
    for t in xml_element.itertext():
        if t in ('.', ',', ';', '!', '?'):
            text = text.strip()  # remove space before these characters
        text += t + ' '
    text = text.strip()
    return text if text != '' else None


def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: tuple of (filename, directory, extended_dir) to address the file
    :return: a one-row pandas DataFrame with one column per attribute
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        # TODO check leitsatz: two words get joined, but the space is still there!
        res[attribute] = extract_text(root.find(attribute))
    for attribute in nested_attributes:
        # nested attributes are encoded as 'outer_inner', e.g. 'region_abk'
        xml_tag = root
        for step in attribute.split('_'):
            if xml_tag is not None:
                xml_tag = xml_tag.find(step)
        res[attribute] = extract_text(xml_tag)
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])

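# Illustration of the returned frame (placeholder values, for demonstration only):
#
#      doknr     ecli     gertyp  ...  doktyp
# 0    <doknr>   <ecli>   BGH     ...  Urteil
#
# create_pickle() below concatenates these one-row frames into a single dataframe.
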
def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended dir to find the files
    :param pickle_name: name of the pickle to save
    :param steps: how many steps the work is split into; each call processes one subset of the cases
    """
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix

    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]

    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)

    pickle_path = dataframe_dir_bgh + extension + pickle_name

    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)

    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)

    print('Resulting dataframes have length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)

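# Hypothetical incremental usage (semantics of `steps` assumed from the call to
# utils.get_step_subset_raw above): with steps=2, a first call processes half of
# the XML files and writes the pickle; a second identical call appends the rest.
# create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)
# create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)
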
def get_selected_bgh_data(directory='.\\'):
    """
    Shortcut for getting the BGH data currently needed: all judgments ('Urteil')
    from the civil senates.

    :param directory: directory offset from the current position, with ending slash
    :return: the data
    """
    return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')

def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for accessing the bgh pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with ending slash)
    :param spruchkoerper: can be used to select senates (checks whether the given string is contained
        in the data's spruchkoerper)
    :param doktyp: can be used to select specific document types ('Urteil', 'Beschluss', etc.);
        the document type must contain the given word
    :return: the data as a pandas dataframe
    """
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    data = utils.df_from_pickle(current_path=current_path, path=directory + dataframe_dir_bgh + extension + pickle_name)
    if spruchkoerper is not None:
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    data = data.dropna(axis=1, how='all')  # drop all columns with no value
    data = data.drop_duplicates()
    return data

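# Example (hypothetical filters, mirroring get_selected_bgh_data above):
# judgments = get_data(pickle_name_bgh, directory='../data/',
#                      spruchkoerper='Zivilsenat', doktyp='Urteil')
# decisions = get_data(pickle_name_bgh, directory='../data/', doktyp='Beschluss')
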
# if __name__ == "__main__":
#     download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
#     create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)