import os
import time
import urllib.request as request
import xml.etree.ElementTree as ET
import zipfile

import pandas as pd

import settings
import utils
from utils import time_convert

base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'
dataframe_dir_bgh = 'dataframes/bgh/'
pickle_name_bgh = 'bgh_data.pkl'
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
nested_attributes = ["region_abk", "region_long"]
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'
current_path = 'data'


def get_file_list():
    """
    Makes an HTTP request for the table of contents of all current cases.

    :return: the root of the downloaded XML tree
    """
    xml_file, headers = request.urlretrieve('https://www.rechtsprechung-im-internet.de/rii-toc.xml')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root
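
# A minimal usage sketch. That child[0] holds the court name and child[3] the
# download link is an assumption read off the indexing used in download() below:
#
#   root = get_file_list()
#   for child in root:
#       court, link = child[0].text, child[3].text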


def count_cases(root, tag):
    """
    Counts all cases whose name contains the given tag and returns the count.

    :param root: downloaded XML tree with all files
    :param tag: tag to find in the name (e.g. 'BGH')
    :return: number of cases matching the tag
    """
    count = 0
    for child in root:
        if tag in child[0].text:
            count += 1
    return count
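
# Example usage, assuming the 'BGH' tag that the rest of this file works with:
#
#   root = get_file_list()
#   print(count_cases(root, 'BGH'))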


def download(base_dir, extended_dir, tag):
    """
    Downloads all cases into folders named after their senates.

    :param base_dir: name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download; child[0] is the court, child[3] is the download link
    root = get_file_list()
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        try:
            if tag in child[0].text:
                filename, http = request.urlretrieve(child[3].text)
                with zipfile.ZipFile(filename, 'r') as zip_ref:
                    zip_ref.extractall(
                        utils.server_path(current_path=current_path,
                                          path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
                os.remove(filename)
                downloaded += 1
                print("\rDownloaded %d of %d %s cases" % (downloaded, max_cases, tag), end="")
        except Exception:
            continue  # skip cases whose download or extraction fails
    print("\nDone!")


def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: tuple of (filename, directory, extended directory) to address the file
    :return: a one-row pandas dataframe with key: attribute_name and value: attribute_value
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        attr = root.find(attribute)  # check leitsatz: two words joined together, but the space is still there!
        text = ''
        for t in attr.itertext():
            if t in ('.', ',', ';', '!', '?'):
                text = text.strip()  # remove the space before these characters
            text += t + ' '
        text = text.strip()
        if text == '':
            res[attribute] = None
        else:
            res[attribute] = text
    for attribute in nested_attributes:
        nesting = attribute.split('_')
        xml_tag = root
        # walk down the tag hierarchy to find the nested attribute
        for i in range(len(nesting)):
            xml_tag = xml_tag.find(nesting[i])
        text = ""
        for t in xml_tag.itertext():
            if t in ('.', ',', ';', '!', '?'):
                text = text.strip()  # remove the space before these characters
            text += t + ' '
        text = text.strip()
        if text == '':
            res[attribute] = None
        else:
            res[attribute] = text
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])
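
# Example usage; the filename and senate directory below are hypothetical, but
# the tuple layout matches the one built in create_pickle():
#
#   df = read_file_data(('some_case.xml', 'IX. Zivilsenat', extended_dir_bgh))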


def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended directory in which to find the files
    :param pickle_name: name of the pickle to save
    :param steps: how many cases should be worked on now
    """
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]
    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)
    pickle_path = dataframe_dir_bgh + extension + pickle_name
    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)
    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)
    print('Resulting dataframes have length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)
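
# Example usage; steps=2 is the value from the commented-out __main__ block
# below:
#
#   create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)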


def get_selected_bgh_data(directory='.\\'):
    """
    Shortcut for getting the BGH data currently needed: selects all judgments ('Urteil') from the civil senates.

    :param directory: directory offset from the current position, with ending slash
    :return: the data
    """
    return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')


def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for accessing the BGH pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with ending slash)
    :param spruchkoerper: can be used to select senates (checks whether the given string is contained
        in the data's spruchkoerper)
    :param doktyp: can be used to select specific document types ('Urteil', 'Beschluss', etc.); matched
        case-insensitively as a substring
    :return: the data as a pandas dataframe
    """
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    data = utils.df_from_pickle(current_path=current_path,
                                path=directory + dataframe_dir_bgh + extension + pickle_name)
    if spruchkoerper is not None:
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    data = data.dropna(axis=1, how='all')  # drop all columns with no value
    data = data.drop_duplicates()
    return data
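
# Example usage; the directory argument is an assumption about where this
# module sits relative to the data:
#
#   judgments = get_selected_bgh_data(directory='../data/')
#   decisions = get_data(pickle_name_bgh, doktyp='Beschluss')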


# if __name__ == "__main__":
#     download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
#     create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)