From 11487cb11ff0789cf7689650f37875d8d16cdc57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Frederik=20M=C3=B6llers?= Date: Wed, 2 Feb 2022 02:43:33 +0100 Subject: [PATCH] Major optimizations & consolidation of code --- fix_dates.py | 167 +++++++++++++++++++++------------------------------ 1 file changed, 69 insertions(+), 98 deletions(-) diff --git a/fix_dates.py b/fix_dates.py index 77a2573..ab40782 100644 --- a/fix_dates.py +++ b/fix_dates.py @@ -1,6 +1,8 @@ import argparse import datetime import time +import urllib.parse +import sys import xml.etree.ElementTree as ET import requests @@ -9,8 +11,10 @@ import requests # some constants FILES_PATH_PREFIX = "/remote.php/dav/files/" VERSIONS_PATH_PREFIX = "/remote.php/dav/versions/" +# the threshold for file timestamps (dates older than this are considered invalid) +DATE_THRESHOLD = datetime.datetime(1990, 1, 1) # we only need one session for the whole script -session = requests.Session() +SESSION = requests.Session() def propfind(path, auth): @@ -18,8 +22,11 @@ def propfind(path, auth): Get a file's Last Modified timestamp and FileID via a PROPFIND request :param path: The path of the file in question :param auth: Auth data for the HTTP request (e.g. a requests.auth.HTTPBasicAuth object) - :return: The properties in XML format + :return: An iterator of dictionaries, one for every directory entry. Entry properties are taken from the PROPFIND + response """ + # do not descend further into subdirectories\ + # TODO: we could probably be faster if we did headers = {"Depth": "1"} # This body returns only the timelastmodified and the fileid variable requested_data = \ @@ -27,96 +34,56 @@ def propfind(path, auth): + """ req = requests.Request("PROPFIND", path, headers=headers, auth=auth, data=requested_data) - resp = session.send(req.prepare()) - print(resp.text) - return resp.text + resp = SESSION.send(req.prepare()) + et = ET.fromstring(resp.text) + for dav_response in et.findall('{DAV:}response'): + entry = {} + entry["path"] = dav_response.find("{DAV:}href").text + # skip this entry itself + if path.endswith(entry["path"]): + continue + props = dav_response.find("{DAV:}propstat").find("{DAV:}prop") + try: + entry["last_modified"] = datetime.datetime.strptime( + props.find("{DAV:}getlastmodified").text, + "%a, %d %b %Y %H:%M:%S GMT" + ) + except (AttributeError, TypeError): + pass + entry["resource_type"] = [] + try: + for resourcetype in props.find("{DAV:}resourcetype"): + entry["resource_type"].append(resourcetype.tag) + except (AttributeError, TypeError): + pass + try: + entry["file_id"] = int(props.find("{http://owncloud.org/ns}fileid").text) + except (AttributeError, TypeError): + pass + yield entry -def search_folder(requestreturn): - """ - Iterates through a folder's properties XML and find entries with invalid timestamps - :param requestreturn: The XML returned by propfind() - :return: A tuple of two lists. The first list contains all subfolders, the second contains all FileIDs of entries - with an invalid timestamp. - """ - # List to collect path of folders stored in path - innerfolders = [] - # List to collect path of files with wrong timestamp - linkswrongtime = [] - # First folder provided in the xml file is always the folder we are currently in. - # To prevent searching this folder twice, or ending in a loop, we can not store this folder in the folderlist again. - firstfolder = True - # Get a xml tree - tree = ET.ElementTree(ET.fromstring(requestreturn)) - # Find all responses in the tree, those contain the fielpath, lasttimemodified, typeoffile etc. - for resp in tree.findall('.//{DAV:}response'): - # Here we can get the filpath out of href and get further information in prop (lastimemodified, typeoffile, etc.) - for p in resp: - # In case p.text is not none, it contains the filepath - if not (p.text is None): - if (p.text[-1] == '/'): - # If the current object is a folder, check it its not first folder - if not (firstfolder): - innerfolders.append(p.text) - else: - firstfolder = False - break - # In case p.text is none, it contains the further information - else: - for t in p.findall('.//{DAV:}getlastmodified'): - # this function converts the given date to unix timestamp - lastmodified = time.mktime( - datetime.datetime.strptime(t.text, "%a, %d %b %Y %H:%M:%S GMT").timetuple()) - # 631148400 is the unix timestamp of 01.01.1990 00:00:00, because we know there is no file older - # than this in our nextcloud - if lastmodified < 631148400: - for fileid in p.findall('.//{http://owncloud.org/ns}fileid'): - linkswrongtime.append(fileid.text) - return (innerfolders, linkswrongtime) - - -def version_check(xmlfile): +def find_valid_version(versions): """ This function returns the fileid of the version of a given fileid with the most current timestamp or None if - there are no versions with a timestamp younger than 01.01.1990 - :param xmlfile: An XML file with Last Modified timestamps and FileIDs as returned by propfind() - :return: The FileID of the most recent version or None if no valid version exists + there are no versions with a timestamp younger than the threshold + :param versions: An iterator as returned by propfind() + :return: The entry of the iterator which has the most recent date or None if none exists """ - tree = ET.ElementTree(ET.fromstring(xmlfile)) - # Name of files are stored as a string. But we can check the timestamp of the file only after we can check the name, - # so it needs to be stored temporary in case the timestamp is the most current - temp = "this is a temporary string" - # These two variables are used to store the highest/most current timestamp and the associated fileid - most_current_timestamp = 631148400 - most_current_timestamp_fileid = 0 - for resp in tree.findall('.//{DAV:}response'): - for p in resp: - # In case p.text is not none, it contains the filepath - if not (p.text is None): - temp = p.text - else: - for t in p.findall('.//{DAV:}getlastmodified'): - if not (t.text is None): - # this function converts the given date to unix timestamp - lastmodified = time.mktime( - datetime.datetime.strptime(t.text, "%a, %d %b %Y %H:%M:%S GMT").timetuple()) - # - if lastmodified > most_current_timestamp: - most_current_timestamp = lastmodified - fileid_old_version = temp.split('/') - most_current_timestamp_fileid = fileid_old_version[-1] - else: - break - # Check if there is another version and a file with a current timestamp - if most_current_timestamp_fileid != 0: - return most_current_timestamp_fileid - else: + # mock entry for comparison + most_recent = {"last_modified": DATE_THRESHOLD} + for version in versions: + if "last_modified" in version and version["last_modified"] > most_recent["last_modified"]: + most_recent = version + if most_recent["last_modified"] == DATE_THRESHOLD: return None + return most_recent if __name__ == "__main__": @@ -138,24 +105,28 @@ if __name__ == "__main__": mainpath = FILES_PATH_PREFIX + arguments.username + arguments.search_path # List of all folders we need to enter folders = [mainpath] - # List of all fileids with wrong time + # List of all entries with wrong time wrongtime = [] # Iterate through all folders and check for wrong timestamps while folders: - path_suffix = folders.pop(0) - path = arguments.server + str(path_suffix) - r = propfind(path, auth) - new_folders, new_wrongtime = search_folder(r) - # Append all found folders and files with wrong timestamps to global list - folders += new_folders - wrongtime += new_wrongtime - # Iterate through all fileids with wrong timestamps and check for versions with intact timestamp - while wrongtime: - fileid = wrongtime.pop(0) - version_suffix = VERSIONS_PATH_PREFIX + arguments.username + "/versions/" + fileid - version_path = arguments.server + version_suffix - versions = propfind(version_path, auth) - mrv = version_check(versions) - print(fileid, end=": ") - print(mrv) + url = arguments.server + folders.pop(0) + print("+", end="", flush=True) + for entry in propfind(url, auth): + print(".", end="", flush=True) + # put directories in search list + if "resource_type" in entry and "{DAV:}collection" in entry["resource_type"]: + folders.append(entry["path"]) + # put files with wrong date in wrong date list (we don't know what to do if a directory has an invalid date) + elif "last_modified" in entry and entry["last_modified"] < DATE_THRESHOLD: + wrongtime.append(entry) + # Iterate through all fileids with wrong timestamps and check for versions with intact timestamp + print() + # NOTE: you can indent this into the loop above to fix things on-the-fly instead of all at once + for entry in wrongtime: + print(urllib.parse.unquote(entry["path"][len(FILES_PATH_PREFIX):])) + fixed_version = find_valid_version(propfind(arguments.server + VERSIONS_PATH_PREFIX + arguments.username + "/versions/" + str(entry["file_id"]), auth)) + if fixed_version: + print("Restore from {}".format(fixed_version)) + else: + print("Touch file.") \ No newline at end of file