|
@ -1,6 +1,8 @@ |
|
|
import argparse |
|
|
import argparse |
|
|
import datetime |
|
|
import datetime |
|
|
import time |
|
|
import time |
|
|
|
|
|
import urllib.parse |
|
|
|
|
|
import sys |
|
|
import xml.etree.ElementTree as ET |
|
|
import xml.etree.ElementTree as ET |
|
|
|
|
|
|
|
|
import requests |
|
|
import requests |
|
@ -9,8 +11,10 @@ import requests |
|
|
# some constants |
|
|
# some constants |
|
|
FILES_PATH_PREFIX = "/remote.php/dav/files/" |
|
|
FILES_PATH_PREFIX = "/remote.php/dav/files/" |
|
|
VERSIONS_PATH_PREFIX = "/remote.php/dav/versions/" |
|
|
VERSIONS_PATH_PREFIX = "/remote.php/dav/versions/" |
|
|
|
|
|
# the threshold for file timestamps (dates older than this are considered invalid) |
|
|
|
|
|
DATE_THRESHOLD = datetime.datetime(1990, 1, 1) |
|
|
# we only need one session for the whole script |
|
|
# we only need one session for the whole script |
|
|
session = requests.Session() |
|
|
|
|
|
|
|
|
SESSION = requests.Session() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def propfind(path, auth): |
|
|
def propfind(path, auth): |
|
@ -18,8 +22,11 @@ def propfind(path, auth): |
|
|
Get a file's Last Modified timestamp and FileID via a PROPFIND request |
|
|
Get a file's Last Modified timestamp and FileID via a PROPFIND request |
|
|
:param path: The path of the file in question |
|
|
:param path: The path of the file in question |
|
|
:param auth: Auth data for the HTTP request (e.g. a requests.auth.HTTPBasicAuth object) |
|
|
:param auth: Auth data for the HTTP request (e.g. a requests.auth.HTTPBasicAuth object) |
|
|
:return: The properties in XML format |
|
|
|
|
|
|
|
|
:return: An iterator of dictionaries, one for every directory entry. Entry properties are taken from the PROPFIND |
|
|
|
|
|
response |
|
|
""" |
|
|
""" |
|
|
|
|
|
# do not descend further into subdirectories |
|
|
|
|
|
# TODO: we could probably be faster if we did |
|
|
headers = {"Depth": "1"} |
|
|
headers = {"Depth": "1"} |
|
|
# This body returns only the timelastmodified and the fileid variable |
|
|
# This body returns only the timelastmodified and the fileid variable |
|
|
requested_data = \ |
|
|
requested_data = \ |
|
@ -27,96 +34,56 @@ def propfind(path, auth): |
|
|
<d:propfind xmlns:d=\"DAV:\" xmlns:oc=\"http://owncloud.org/ns\" xmlns:nc=\"http://nextcloud.org/ns\"> |
|
|
<d:propfind xmlns:d=\"DAV:\" xmlns:oc=\"http://owncloud.org/ns\" xmlns:nc=\"http://nextcloud.org/ns\"> |
|
|
<d:prop> |
|
|
<d:prop> |
|
|
<d:getlastmodified /> |
|
|
<d:getlastmodified /> |
|
|
|
|
|
<d:resourcetype /> |
|
|
<oc:fileid /> |
|
|
<oc:fileid /> |
|
|
</d:prop> |
|
|
</d:prop> |
|
|
</d:propfind> |
|
|
</d:propfind> |
|
|
""" |
|
|
""" |
|
|
req = requests.Request("PROPFIND", path, headers=headers, auth=auth, data=requested_data) |
|
|
req = requests.Request("PROPFIND", path, headers=headers, auth=auth, data=requested_data) |
|
|
resp = session.send(req.prepare()) |
|
|
|
|
|
print(resp.text) |
|
|
|
|
|
return resp.text |
|
|
|
|
|
|
|
|
resp = SESSION.send(req.prepare()) |
|
|
|
|
|
et = ET.fromstring(resp.text) |
|
|
|
|
|
for dav_response in et.findall('{DAV:}response'): |
|
|
|
|
|
entry = {} |
|
|
|
|
|
entry["path"] = dav_response.find("{DAV:}href").text |
|
|
|
|
|
# skip this entry itself |
|
|
|
|
|
if path.endswith(entry["path"]): |
|
|
|
|
|
continue |
|
|
|
|
|
props = dav_response.find("{DAV:}propstat").find("{DAV:}prop") |
|
|
|
|
|
try: |
|
|
|
|
|
entry["last_modified"] = datetime.datetime.strptime( |
|
|
|
|
|
props.find("{DAV:}getlastmodified").text, |
|
|
|
|
|
"%a, %d %b %Y %H:%M:%S GMT" |
|
|
|
|
|
) |
|
|
|
|
|
except (AttributeError, TypeError): |
|
|
|
|
|
pass |
|
|
|
|
|
entry["resource_type"] = [] |
|
|
|
|
|
try: |
|
|
|
|
|
for resourcetype in props.find("{DAV:}resourcetype"): |
|
|
|
|
|
entry["resource_type"].append(resourcetype.tag) |
|
|
|
|
|
except (AttributeError, TypeError): |
|
|
|
|
|
pass |
|
|
|
|
|
try: |
|
|
|
|
|
entry["file_id"] = int(props.find("{http://owncloud.org/ns}fileid").text) |
|
|
|
|
|
except (AttributeError, TypeError): |
|
|
|
|
|
pass |
|
|
|
|
|
yield entry |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_folder(requestreturn): |
|
|
|
|
|
""" |
|
|
|
|
|
Iterates through a folder's properties XML and finds entries with invalid timestamps |
|
|
|
|
|
:param requestreturn: The XML returned by propfind() |
|
|
|
|
|
:return: A tuple of two lists. The first list contains all subfolders, the second contains all FileIDs of entries |
|
|
|
|
|
with an invalid timestamp. |
|
|
|
|
|
""" |
|
|
|
|
|
# List to collect path of folders stored in path |
|
|
|
|
|
innerfolders = [] |
|
|
|
|
|
# List to collect path of files with wrong timestamp |
|
|
|
|
|
linkswrongtime = [] |
|
|
|
|
|
# First folder provided in the xml file is always the folder we are currently in. |
|
|
|
|
|
# To prevent searching this folder twice, or ending in a loop, we can not store this folder in the folderlist again. |
|
|
|
|
|
firstfolder = True |
|
|
|
|
|
# Get a xml tree |
|
|
|
|
|
tree = ET.ElementTree(ET.fromstring(requestreturn)) |
|
|
|
|
|
# Find all responses in the tree; they contain the file path, last-modified time, file type, etc. |
|
|
|
|
|
for resp in tree.findall('.//{DAV:}response'): |
|
|
|
|
|
# Here we can get the file path from href and further information from prop (last-modified time, file type, etc.) |
|
|
|
|
|
for p in resp: |
|
|
|
|
|
# In case p.text is not none, it contains the filepath |
|
|
|
|
|
if not (p.text is None): |
|
|
|
|
|
if (p.text[-1] == '/'): |
|
|
|
|
|
# If the current object is a folder, check that it is not the first folder |
|
|
|
|
|
if not (firstfolder): |
|
|
|
|
|
innerfolders.append(p.text) |
|
|
|
|
|
else: |
|
|
|
|
|
firstfolder = False |
|
|
|
|
|
break |
|
|
|
|
|
# In case p.text is none, it contains the further information |
|
|
|
|
|
else: |
|
|
|
|
|
for t in p.findall('.//{DAV:}getlastmodified'): |
|
|
|
|
|
# this function converts the given date to unix timestamp |
|
|
|
|
|
lastmodified = time.mktime( |
|
|
|
|
|
datetime.datetime.strptime(t.text, "%a, %d %b %Y %H:%M:%S GMT").timetuple()) |
|
|
|
|
|
# 631148400 is the unix timestamp of 01.01.1990 00:00:00, because we know there is no file older |
|
|
|
|
|
# than this in our nextcloud |
|
|
|
|
|
if lastmodified < 631148400: |
|
|
|
|
|
for fileid in p.findall('.//{http://owncloud.org/ns}fileid'): |
|
|
|
|
|
linkswrongtime.append(fileid.text) |
|
|
|
|
|
return (innerfolders, linkswrongtime) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def version_check(xmlfile): |
|
|
|
|
|
|
|
|
def find_valid_version(versions): |
|
|
""" |
|
|
""" |
|
|
This function returns the fileid of the version of a given fileid with the most current timestamp or None if |
|
|
This function returns the fileid of the version of a given fileid with the most current timestamp or None if |
|
|
there are no versions with a timestamp younger than 01.01.1990 |
|
|
|
|
|
:param xmlfile: An XML file with Last Modified timestamps and FileIDs as returned by propfind() |
|
|
|
|
|
:return: The FileID of the most recent version or None if no valid version exists |
|
|
|
|
|
|
|
|
there are no versions with a timestamp younger than the threshold |
|
|
|
|
|
:param versions: An iterator as returned by propfind() |
|
|
|
|
|
:return: The entry of the iterator which has the most recent date or None if none exists |
|
|
""" |
|
|
""" |
|
|
tree = ET.ElementTree(ET.fromstring(xmlfile)) |
|
|
|
|
|
# Name of files are stored as a string. But we can check the timestamp of the file only after we can check the name, |
|
|
|
|
|
# so it needs to be stored temporarily in case the timestamp is the most current |
|
|
|
|
|
temp = "this is a temporary string" |
|
|
|
|
|
# These two variables are used to store the highest/most current timestamp and the associated fileid |
|
|
|
|
|
most_current_timestamp = 631148400 |
|
|
|
|
|
most_current_timestamp_fileid = 0 |
|
|
|
|
|
for resp in tree.findall('.//{DAV:}response'): |
|
|
|
|
|
for p in resp: |
|
|
|
|
|
# In case p.text is not none, it contains the filepath |
|
|
|
|
|
if not (p.text is None): |
|
|
|
|
|
temp = p.text |
|
|
|
|
|
else: |
|
|
|
|
|
for t in p.findall('.//{DAV:}getlastmodified'): |
|
|
|
|
|
if not (t.text is None): |
|
|
|
|
|
# this function converts the given date to unix timestamp |
|
|
|
|
|
lastmodified = time.mktime( |
|
|
|
|
|
datetime.datetime.strptime(t.text, "%a, %d %b %Y %H:%M:%S GMT").timetuple()) |
|
|
|
|
|
# |
|
|
|
|
|
if lastmodified > most_current_timestamp: |
|
|
|
|
|
most_current_timestamp = lastmodified |
|
|
|
|
|
fileid_old_version = temp.split('/') |
|
|
|
|
|
most_current_timestamp_fileid = fileid_old_version[-1] |
|
|
|
|
|
else: |
|
|
|
|
|
break |
|
|
|
|
|
# Check if there is another version and a file with a current timestamp |
|
|
|
|
|
if most_current_timestamp_fileid != 0: |
|
|
|
|
|
return most_current_timestamp_fileid |
|
|
|
|
|
else: |
|
|
|
|
|
|
|
|
# mock entry for comparison |
|
|
|
|
|
most_recent = {"last_modified": DATE_THRESHOLD} |
|
|
|
|
|
for version in versions: |
|
|
|
|
|
if "last_modified" in version and version["last_modified"] > most_recent["last_modified"]: |
|
|
|
|
|
most_recent = version |
|
|
|
|
|
if most_recent["last_modified"] == DATE_THRESHOLD: |
|
|
return None |
|
|
return None |
|
|
|
|
|
return most_recent |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
if __name__ == "__main__": |
|
@ -138,24 +105,28 @@ if __name__ == "__main__": |
|
|
mainpath = FILES_PATH_PREFIX + arguments.username + arguments.search_path |
|
|
mainpath = FILES_PATH_PREFIX + arguments.username + arguments.search_path |
|
|
# List of all folders we need to enter |
|
|
# List of all folders we need to enter |
|
|
folders = [mainpath] |
|
|
folders = [mainpath] |
|
|
# List of all fileids with wrong time |
|
|
|
|
|
|
|
|
# List of all entries with wrong time |
|
|
wrongtime = [] |
|
|
wrongtime = [] |
|
|
|
|
|
|
|
|
# Iterate through all folders and check for wrong timestamps |
|
|
# Iterate through all folders and check for wrong timestamps |
|
|
while folders: |
|
|
while folders: |
|
|
path_suffix = folders.pop(0) |
|
|
|
|
|
path = arguments.server + str(path_suffix) |
|
|
|
|
|
r = propfind(path, auth) |
|
|
|
|
|
new_folders, new_wrongtime = search_folder(r) |
|
|
|
|
|
# Append all found folders and files with wrong timestamps to global list |
|
|
|
|
|
folders += new_folders |
|
|
|
|
|
wrongtime += new_wrongtime |
|
|
|
|
|
# Iterate through all fileids with wrong timestamps and check for versions with intact timestamp |
|
|
|
|
|
while wrongtime: |
|
|
|
|
|
fileid = wrongtime.pop(0) |
|
|
|
|
|
version_suffix = VERSIONS_PATH_PREFIX + arguments.username + "/versions/" + fileid |
|
|
|
|
|
version_path = arguments.server + version_suffix |
|
|
|
|
|
versions = propfind(version_path, auth) |
|
|
|
|
|
mrv = version_check(versions) |
|
|
|
|
|
print(fileid, end=": ") |
|
|
|
|
|
print(mrv) |
|
|
|
|
|
|
|
|
url = arguments.server + folders.pop(0) |
|
|
|
|
|
print("+", end="", flush=True) |
|
|
|
|
|
for entry in propfind(url, auth): |
|
|
|
|
|
print(".", end="", flush=True) |
|
|
|
|
|
# put directories in search list |
|
|
|
|
|
if "resource_type" in entry and "{DAV:}collection" in entry["resource_type"]: |
|
|
|
|
|
folders.append(entry["path"]) |
|
|
|
|
|
# put files with wrong date in wrong date list (we don't know what to do if a directory has an invalid date) |
|
|
|
|
|
elif "last_modified" in entry and entry["last_modified"] < DATE_THRESHOLD: |
|
|
|
|
|
wrongtime.append(entry) |
|
|
|
|
|
# Iterate through all fileids with wrong timestamps and check for versions with intact timestamp |
|
|
|
|
|
print() |
|
|
|
|
|
# NOTE: you can indent this into the loop above to fix things on-the-fly instead of all at once |
|
|
|
|
|
for entry in wrongtime: |
|
|
|
|
|
print(urllib.parse.unquote(entry["path"][len(FILES_PATH_PREFIX):])) |
|
|
|
|
|
fixed_version = find_valid_version(propfind(arguments.server + VERSIONS_PATH_PREFIX + arguments.username + "/versions/" + str(entry["file_id"]), auth)) |
|
|
|
|
|
if fixed_version: |
|
|
|
|
|
print("Restore from {}".format(fixed_version)) |
|
|
|
|
|
else: |
|
|
|
|
|
print("Touch file.") |