import time
import xml.etree.ElementTree as ET
import urllib.request as request
import zipfile
import os

import pandas as pd

import settings
import utils
from utils import time_convert
base_dir_bgh = 'raw_data/BGH_Data'
extended_dir_bgh = base_dir_bgh + '/senates'
dataframe_dir_bgh = 'dataframes/bgh/'
pickle_name_bgh = 'bgh_data.pkl'
simple_attributes = ["doknr", "ecli", "gertyp", "gerort", "spruchkoerper", "entsch-datum",
                     "aktenzeichen", "doktyp", "norm", "vorinstanz", "mitwirkung", "titelzeile",
                     "leitsatz", "sonstosatz", "tenor", "tatbestand", "entscheidungsgruende",
                     "gruende", "abwmeinung", "sonstlt", "identifier", "coverage", "language",
                     "publisher", "accessRights"]
nested_attributes = ["region_abk", "region_long"]
text_attributes = ["titelzeile", "leitsatz", "sonstosatz", "tenor", "tatbestand",
                   "entscheidungsgruende", "gruende", "abwmeinung", "sonstlt"]
stopword_extension = '_no_stopwords'
current_path = 'data'
def get_file_list():
    """
    Makes an HTTP request for the list of files.

    :return: the root of the XML tree listing all current cases
    """
    xml_file, https_message = request.urlretrieve('https://www.rechtsprechung-im-internet.de/rii-toc.xml')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    return root
def count_cases(root, tag):
    """
    Counts all cases belonging to the given tag.

    :param root: downloaded XML tree with all files
    :param tag: tag to find in the court name (e.g. 'BGH')
    :return: number of cases belonging to that tag
    """
    count = 0
    for child in root:
        if tag in child[0].text:
            count += 1
    return count
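# A minimal usage sketch, assuming network access to rechtsprechung-im-internet.de:
# fetch the table of contents once and count the BGH entries.
#
#   toc_root = get_file_list()
#   print(count_cases(toc_root, 'BGH'))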
def download(base_dir, extended_dir, tag):
    """
    Downloads all cases into folders named after their senates.

    :param base_dir: name of the directory for the data
    :param extended_dir: name of the subdirectory for saving
    :param tag: tag to recognize the court (BGH, BVerwG)
    """
    # set up directories
    utils.create_dir(current_path=current_path, directory_name=base_dir)
    utils.create_dir(current_path=current_path, directory_name=extended_dir)
    # do the download
    root = get_file_list()  # child[0] is the court, child[3] is the download link
    max_cases = count_cases(root, tag)
    downloaded = 0
    for child in root:
        try:
            if tag in child[0].text:
                filename, http = request.urlretrieve(child[3].text)
                with zipfile.ZipFile(filename, 'r') as zip_ref:
                    zip_ref.extractall(
                        utils.server_path(current_path=current_path,
                                          path=extended_dir + '/' + child[0].text.replace('\n', '') + '/'))
                os.remove(filename)
                downloaded += 1
                print("\rDownloaded %d of %d " % (downloaded, max_cases) + tag + " cases", end="")
        except Exception:
            # a failed download or unpack of a single file is skipped;
            # the original while/try/finally construct silently swallowed these errors as well
            continue
    print("\nDone!")
def read_file_data(file):
    """
    Reads the data of one case / file.

    :param file: tuple of (filename, directory, directory extension) to address the file
    :return: a single-row pandas DataFrame with one column per attribute
    """
    filename, directory, extended_dir = file
    tree = ET.parse(utils.server_path(current_path=current_path,
                                      path=os.path.join(extended_dir, directory, filename)))
    root = tree.getroot()
    res = {}
    for attribute in simple_attributes:
        attr = root.find(attribute)  # check leitsatz: two words run together, but the space is still there!
        text = ''
        for t in attr.itertext():
            if t in ('.', ',', ';', '!', '?'):
                text = text.strip()  # remove the space before these characters
            text += t + ' '
        text = text.strip()
        if text == '':
            res[attribute] = None
        else:
            res[attribute] = text
    for attribute in nested_attributes:
        nesting = attribute.split('_')
        xml_tag = root
        # walk down the nesting path to find the nested attribute
        for level in nesting:
            xml_tag = xml_tag.find(level)
        text = ""
        for t in xml_tag.itertext():
            if t in ('.', ',', ';', '!', '?'):
                text = text.strip()  # remove the space before these characters
            text += t + ' '
        text = text.strip()
        if text == '':
            res[attribute] = None
        else:
            res[attribute] = text
    for attribute in utils.rii_text_columns:
        if res[attribute] is not None:
            if settings.remove_brackets:
                res[attribute] = utils.remove_brackets(res[attribute])
            res[attribute] = utils.remove_spaces_before_sentence_marks(res[attribute])
    return pd.DataFrame(res, index=[0])
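# Usage sketch for a single file; the file and senate names here are
# hypothetical, the tuple layout mirrors what create_pickle builds below.
#
#   df = read_file_data(('example_case.xml', 'some_senate', extended_dir_bgh))
#   print(df['aktenzeichen'].iloc[0])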
def create_pickle(extended_dir, pickle_name, steps):
    """
    Combines all downloaded files of the given extended directory into one pickle.

    :param extended_dir: extended directory containing the files
    :param pickle_name: name of the pickle to save
    :param steps: how many cases should be processed in this run
    """
    utils.create_dir(current_path=current_path, directory_name=dataframe_dir_bgh, delete=False)
    start_time = time.time()
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    files = [(filename, directory, extended_dir) for directory in
             utils.list_dir_files(current_path=current_path, path=extended_dir) for filename in
             utils.list_dir_files(current_path=current_path, path=os.path.join(extended_dir, directory))
             if filename.endswith(".xml")]
    original_length = len(files)
    data = pd.DataFrame(columns=simple_attributes + nested_attributes)
    pickle_path = dataframe_dir_bgh + extension + pickle_name
    files, data = utils.get_step_subset_raw(steps=steps,
                                            path_to_dest_dataframe=pickle_path,
                                            source_data=files,
                                            dest_data=data,
                                            call_path=current_path)
    result = utils.parallel_imap(read_file_data, files)
    for row in result:
        data = pd.concat([data, row], ignore_index=True)
    with utils.open_file(current_path=current_path, path=pickle_path, modes='wb') as f:
        data.to_pickle(f)
    print('Resulting dataframe has length ' + str(data.shape[0]) +
          ' (' + str(data.shape[0] / original_length * 100) + '%)')
    end_time = time.time()
    time_lapsed = end_time - start_time
    time_convert(time_lapsed)
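# Usage sketch, matching the call in the main block below; steps controls how
# much of the remaining work is processed in this run (see
# utils.get_step_subset_raw for the exact semantics).
#
#   create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)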
def get_selected_bgh_data(directory='.\\'):
    """
    Shortcut for getting the BGH data currently needed: selects all judgments ('Urteil') from the civil senates.

    :param directory: directory offset from the current position, with trailing slash
    :return: the data
    """
    return get_data(pickle_name_bgh, directory, spruchkoerper='Zivilsenat', doktyp='Urteil')
def get_data(pickle_name, directory='../data/', spruchkoerper=None, doktyp=None):
    """
    Method for accessing the BGH pickle.

    :param pickle_name: name to identify the data
    :param directory: directory path to the data file (with trailing slash)
    :param spruchkoerper: can be used to select senates (checks whether the given string is contained
        in the data's spruchkoerper)
    :param doktyp: can be used to select specific document types (like 'Urteil', 'Beschluss', etc.);
        the document type must contain the given word
    :return: the data as a pandas DataFrame
    """
    extension = ''
    if settings.remove_brackets:
        extension = settings.no_brackets_suffix
    data = utils.df_from_pickle(current_path=current_path,
                                path=directory + dataframe_dir_bgh + extension + pickle_name)
    if spruchkoerper is not None:
        data = data[data['spruchkoerper'].notnull()]
        data = data[data['spruchkoerper'].str.contains(spruchkoerper)]
    if doktyp is not None:
        data = data[data['doktyp'].str.lower().str.contains(doktyp.lower())]
    data = data.dropna(axis=1, how='all')  # drop all columns with no value
    data = data.drop_duplicates()
    return data
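# Usage sketch: load all judgments ('Urteil') of the civil senates, which is
# what get_selected_bgh_data above does with an explicit directory offset.
#
#   df = get_data(pickle_name_bgh, directory='../data/',
#                 spruchkoerper='Zivilsenat', doktyp='Urteil')
#   print(df.shape)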
# if __name__ == "__main__":
#     download(base_dir=base_dir_bgh, extended_dir=extended_dir_bgh, tag='BGH')
#     create_pickle(extended_dir=extended_dir_bgh, pickle_name=pickle_name_bgh, steps=2)