import os
import time
import urllib.request as request
import xml.etree.ElementTree as ET
import zipfile

import pandas as pd

import settings
import utils
from utils import time_convert

base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'
dataframe_dir_bgh = 'dataframes/bgh/'
pickle_name_bgh = 'bgh_data.pkl'
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
nested_attributes = ["region_abk", "region_long"]
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'
current_path = 'data'

def get_file_list():
    """
    Fetches the current table of contents from rechtsprechung-im-internet.de.

    :return: the root of the XML tree listing all currently published cases
    """
    xml_file, https_message = request.urlretrieve('https://www.rechtsprechung-im-internet.de/rii-toc.xml')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root

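# Assumed feed layout (a sketch inferred from the indices used in download()
# below, not confirmed elsewhere): each child of the root describes one case,
# with child[0] holding the court name and child[3] the link to the zipped XML.
# A quick interactive check of that assumption:
# root = get_file_list()
# for child in list(root)[:3]:
#     print([element.tag for element in child])
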
def count_cases(root, tag):
    """
    Counts all cases whose court name contains the given tag and returns the count.

    :param root: downloaded xml-tree with all files
    :param tag: tag to look for in the court name (e.g. 'BGH')
    :return: number of matching cases
    """
    count = 0
    for child in root:
        if tag in child[0].text:
            count += 1
    return count

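# Quick check (hypothetical usage): how many BGH cases are currently listed?
# root = get_file_list()
# print(count_cases(root, 'BGH'))
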
def download(base_dir, extended_dir, tag):
    """
    Downloads all cases into one folder per senate.

    :param base_dir: name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download; child[0] is the court, child[3] is the download link
    root = get_file_list()
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        if tag not in child[0].text:
            continue
        try:
            filename, http = request.urlretrieve(child[3].text)
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(
                    utils.server_path(current_path=current_path,
                                      path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
            os.remove(filename)
            downloaded += 1
            print("\rDownloaded %d of %d %s cases" % (downloaded, max_cases, tag), end="")
        except Exception:
            # skip cases whose download or extraction fails; the original
            # while/try/finally construct silently discarded these errors too
            continue
    print("\nDone!")

def extract_text(xml_element):
    """
    Collects the text of an xml element, stripping the space that itertext()
    chunking leaves before sentence marks.

    :param xml_element: element whose text content should be gathered, may be None
    :return: the combined text, or None if the element is missing or empty
    """
    if xml_element is None:  # guard: find() returns None for missing tags
        return None
    text = ''
    for t in xml_element.itertext():
        if t in ('.', ',', ';', '!', '?'):
            text = text.strip()  # remove space before these characters
        text += t + ' '
    text = text.strip()
    return text if text != '' else None


def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: tuple of (filename, directory, extended_dir) to address the file
    :return: a one-row pandas DataFrame with one column per attribute
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        # TODO check leitsatz: two words get joined, but the space is still there!
        res[attribute] = extract_text(root.find(attribute))
    for attribute in nested_attributes:
        # nested attributes are encoded as 'outer_inner', e.g. 'region_abk'
        xml_tag = root
        for step in attribute.split('_'):
            if xml_tag is not None:
                xml_tag = xml_tag.find(step)
        res[attribute] = extract_text(xml_tag)
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])

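# Illustration of the returned frame (placeholder values, for demonstration only):
#
#      doknr     ecli     gertyp  ...  doktyp
# 0    <doknr>   <ecli>   BGH     ...  Urteil
#
# create_pickle() below concatenates these one-row frames into a single dataframe.
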
def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended dir to find the files
    :param pickle_name: name of the pickle to save
    :param steps: how many steps the work is split into; each call processes one subset of the cases
    """
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix

    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]

    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)

    pickle_path = dataframe_dir_bgh + extension + pickle_name

    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)

    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)

    print('Resulting dataframes have length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)

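# Hypothetical incremental usage (semantics of `steps` assumed from the call to
# utils.get_step_subset_raw above): with steps=2, a first call processes half of
# the XML files and writes the pickle; a second identical call appends the rest.
# create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)
# create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)
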
def get_selected_bgh_data(directory='.\\'):
    """
    Shortcut for getting the BGH data currently needed: all judgments ('Urteil')
    from the civil senates.

    :param directory: directory offset from the current position, with ending slash
    :return: the data
    """
    return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')

def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for accessing the bgh pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with ending slash)
    :param spruchkoerper: can be used to select senates (checks whether the given string is contained
        in the data's spruchkoerper)
    :param doktyp: can be used to select specific document types ('Urteil', 'Beschluss', etc.);
        the document type must contain the given word
    :return: the data as a pandas dataframe
    """
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    data = utils.df_from_pickle(current_path=current_path, path=directory + dataframe_dir_bgh + extension + pickle_name)
    if spruchkoerper is not None:
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    data = data.dropna(axis=1, how='all')  # drop all columns with no value
    data = data.drop_duplicates()
    return data

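# Example (hypothetical filters, mirroring get_selected_bgh_data above):
# judgments = get_data(pickle_name_bgh, directory='../data/',
#                      spruchkoerper='Zivilsenat', doktyp='Urteil')
# decisions = get_data(pickle_name_bgh, directory='../data/', doktyp='Beschluss')
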
# if __name__ == "__main__":
#     download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
#     create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)