Browse Source

Major optimizations & consolidation of code

main
Frederik Möllers 3 years ago
parent
commit
11487cb11f
  1. 167
      fix_dates.py

167
fix_dates.py

@ -1,6 +1,8 @@
import argparse import argparse
import datetime import datetime
import time import time
import urllib.parse
import sys
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import requests import requests
@ -9,8 +11,10 @@ import requests
# some constants # some constants
FILES_PATH_PREFIX = "/remote.php/dav/files/" FILES_PATH_PREFIX = "/remote.php/dav/files/"
VERSIONS_PATH_PREFIX = "/remote.php/dav/versions/" VERSIONS_PATH_PREFIX = "/remote.php/dav/versions/"
# the threshold for file timestamps (dates older than this are considered invalid)
DATE_THRESHOLD = datetime.datetime(1990, 1, 1)
# we only need one session for the whole script # we only need one session for the whole script
session = requests.Session()
SESSION = requests.Session()
def propfind(path, auth): def propfind(path, auth):
@ -18,8 +22,11 @@ def propfind(path, auth):
Get a file's Last Modified timestamp and FileID via a PROPFIND request Get a file's Last Modified timestamp and FileID via a PROPFIND request
:param path: The path of the file in question :param path: The path of the file in question
:param auth: Auth data for the HTTP request (e.g. a requests.auth.HTTPBasicAuth object) :param auth: Auth data for the HTTP request (e.g. a requests.auth.HTTPBasicAuth object)
:return: The properties in XML format
:return: An iterator of dictionaries, one for every directory entry. Entry properties are taken from the PROPFIND
response
""" """
# do not descend further into subdirectories
# TODO: we could probably be faster if we did
headers = {"Depth": "1"} headers = {"Depth": "1"}
# This body returns only the timelastmodified and the fileid variable # This body returns only the timelastmodified and the fileid variable
requested_data = \ requested_data = \
@ -27,96 +34,56 @@ def propfind(path, auth):
<d:propfind xmlns:d=\"DAV:\" xmlns:oc=\"http://owncloud.org/ns\" xmlns:nc=\"http://nextcloud.org/ns\"> <d:propfind xmlns:d=\"DAV:\" xmlns:oc=\"http://owncloud.org/ns\" xmlns:nc=\"http://nextcloud.org/ns\">
<d:prop> <d:prop>
<d:getlastmodified /> <d:getlastmodified />
<d:resourcetype />
<oc:fileid /> <oc:fileid />
</d:prop> </d:prop>
</d:propfind> </d:propfind>
""" """
req = requests.Request("PROPFIND", path, headers=headers, auth=auth, data=requested_data) req = requests.Request("PROPFIND", path, headers=headers, auth=auth, data=requested_data)
resp = session.send(req.prepare())
print(resp.text)
return resp.text
resp = SESSION.send(req.prepare())
et = ET.fromstring(resp.text)
for dav_response in et.findall('{DAV:}response'):
entry = {}
entry["path"] = dav_response.find("{DAV:}href").text
# skip this entry itself
if path.endswith(entry["path"]):
continue
props = dav_response.find("{DAV:}propstat").find("{DAV:}prop")
try:
entry["last_modified"] = datetime.datetime.strptime(
props.find("{DAV:}getlastmodified").text,
"%a, %d %b %Y %H:%M:%S GMT"
)
except (AttributeError, TypeError):
pass
entry["resource_type"] = []
try:
for resourcetype in props.find("{DAV:}resourcetype"):
entry["resource_type"].append(resourcetype.tag)
except (AttributeError, TypeError):
pass
try:
entry["file_id"] = int(props.find("{http://owncloud.org/ns}fileid").text)
except (AttributeError, TypeError):
pass
yield entry
def search_folder(requestreturn):
    """
    Iterate through a folder's properties XML and find entries with invalid timestamps.

    The first directory listed in the XML is treated as the folder that was queried: it is neither added to the list
    of subfolders nor checked for its timestamp, so that it is not searched twice.

    :param requestreturn: The XML returned by propfind()
    :return: A tuple of two lists. The first list contains the paths of all subfolders, the second contains the
             FileIDs (as strings) of all entries with an invalid timestamp.
    :raises ValueError: if a non-empty Last Modified value does not have the expected HTTP date format
    """
    # Timestamps older than this are considered invalid. The parsed dates are naive datetimes in GMT, so comparing
    # them with a naive threshold does not depend on the time zone of the machine running the script.
    threshold = datetime.datetime(1990, 1, 1)
    # Paths of the folders contained in the queried folder
    innerfolders = []
    # FileIDs of the entries with a wrong timestamp
    linkswrongtime = []
    # The first folder in the XML is the folder we are currently in; it must not be queued again.
    firstfolder = True
    root = ET.fromstring(requestreturn)
    for resp in root.findall('.//{DAV:}response'):
        # Look the path up by tag instead of guessing from "text is None", which breaks on pretty-printed XML.
        href = (resp.findtext('{DAV:}href') or '').strip()
        if href.endswith('/'):
            if firstfolder:
                firstfolder = False
                continue
            innerfolders.append(href)
        for prop in resp.iter('{DAV:}prop'):
            stamp = prop.findtext('{DAV:}getlastmodified')
            if not stamp or not stamp.strip():
                # no timestamp reported for this entry, nothing to compare
                continue
            lastmodified = datetime.datetime.strptime(stamp.strip(), "%a, %d %b %Y %H:%M:%S GMT")
            if lastmodified < threshold:
                for fileid in prop.findall('.//{http://owncloud.org/ns}fileid'):
                    if fileid.text:
                        linkswrongtime.append(fileid.text)
    return (innerfolders, linkswrongtime)
def version_check(xmlfile):
    """
    Return the FileID of the version with the most recent valid timestamp.

    The FileID of a version is the last segment of its path. Entries whose path ends in a slash have no such segment
    and are skipped. Timestamps are compared as naive GMT datetimes, so the result does not depend on the time zone
    of the machine running the script.

    :param xmlfile: An XML string with paths and Last Modified timestamps in a PROPFIND multistatus response
    :return: The FileID (as a string) of the most recent version with a timestamp younger than 01.01.1990 or None if
             no such version exists
    :raises ValueError: if a non-empty Last Modified value does not have the expected HTTP date format
    """
    # the most recent timestamp seen so far; starts at the threshold so that older versions are never selected
    most_current_timestamp = datetime.datetime(1990, 1, 1)
    most_current_timestamp_fileid = None
    root = ET.fromstring(xmlfile)
    for resp in root.findall('.//{DAV:}response'):
        href = (resp.findtext('{DAV:}href') or '').strip()
        version_id = href.split('/')[-1]
        if not version_id:
            # a path without a last segment cannot name a version
            continue
        for prop in resp.iter('{DAV:}prop'):
            stamp = prop.findtext('{DAV:}getlastmodified')
            if not stamp or not stamp.strip():
                continue
            lastmodified = datetime.datetime.strptime(stamp.strip(), "%a, %d %b %Y %H:%M:%S GMT")
            if lastmodified > most_current_timestamp:
                most_current_timestamp = lastmodified
                most_current_timestamp_fileid = version_id
    return most_current_timestamp_fileid


def find_valid_version(versions, threshold=None):
    """
    Return the entry with the most recent timestamp or None if there are no versions with a timestamp younger than
    the threshold.

    :param versions: An iterable of dictionaries; entries without a "last_modified" key are ignored
    :param threshold: Timestamps must be strictly younger than this to be considered valid. Defaults to the
                      module-level DATE_THRESHOLD.
    :return: The entry which has the most recent valid date or None if none exists. If several entries share
             the most recent date, the first one is returned.
    """
    if threshold is None:
        threshold = DATE_THRESHOLD
    most_recent = None
    for version in versions:
        modified = version.get("last_modified")
        if modified is None or modified <= threshold:
            continue
        if most_recent is None or modified > most_recent["last_modified"]:
            most_recent = version
    return most_recent
if __name__ == "__main__": if __name__ == "__main__":
@ -138,24 +105,28 @@ if __name__ == "__main__":
mainpath = FILES_PATH_PREFIX + arguments.username + arguments.search_path mainpath = FILES_PATH_PREFIX + arguments.username + arguments.search_path
# List of all folders we need to enter # List of all folders we need to enter
folders = [mainpath] folders = [mainpath]
# List of all fileids with wrong time
# List of all entries with wrong time
wrongtime = [] wrongtime = []
# Iterate through all folders and check for wrong timestamps # Iterate through all folders and check for wrong timestamps
while folders: while folders:
path_suffix = folders.pop(0)
path = arguments.server + str(path_suffix)
r = propfind(path, auth)
new_folders, new_wrongtime = search_folder(r)
# Append all found folders and files with wrong timestamps to global list
folders += new_folders
wrongtime += new_wrongtime
# Iterate through all fileids with wrong timestamps and check for versions with intact timestamp
while wrongtime:
fileid = wrongtime.pop(0)
version_suffix = VERSIONS_PATH_PREFIX + arguments.username + "/versions/" + fileid
version_path = arguments.server + version_suffix
versions = propfind(version_path, auth)
mrv = version_check(versions)
print(fileid, end=": ")
print(mrv)
url = arguments.server + folders.pop(0)
print("+", end="", flush=True)
for entry in propfind(url, auth):
print(".", end="", flush=True)
# put directories in search list
if "resource_type" in entry and "{DAV:}collection" in entry["resource_type"]:
folders.append(entry["path"])
# put files with wrong date in wrong date list (we don't know what to do if a directory has an invalid date)
elif "last_modified" in entry and entry["last_modified"] < DATE_THRESHOLD:
wrongtime.append(entry)
# Iterate through all entries with wrong timestamps and check for versions with an intact timestamp
print()
# NOTE: you can indent this into the loop above to fix things on-the-fly instead of all at once
for entry in wrongtime:
print(urllib.parse.unquote(entry["path"][len(FILES_PATH_PREFIX):]))
fixed_version = find_valid_version(propfind(arguments.server + VERSIONS_PATH_PREFIX + arguments.username + "/versions/" + str(entry["file_id"]), auth))
if fixed_version:
print("Restore from {}".format(fixed_version))
else:
print("Touch file.")
Loading…
Cancel
Save