WIP: Implement HedgeDoc import that visits the created URLs after upload; add accessibility check

13f564f9 · Andreas Domanowski · 8ba97220 · 13f564f9 · 13f564f9
Commit 13f564f9 authored 2 years ago by Andreas Domanowski
--- a/codimd_export.py
+++ b/codimd_export.py
@@ -46,7 +46,7 @@ def export_from_codimd(instance_url, session_id, export_to):
            contents = slurp(f"{document_url}/download", session_id)
            with open(Path(target_dir, f"{document_id}.md"), mode="wb") as stream:
                stream.write(contents)
-            with open(Path(target_dir, f"map.map"), mode="w") as stream:
+            with open(Path(target_dir, f"history.json"), mode="w") as stream:
                json.dump(data, stream)
            num_ok += 1
        except HTTPError as error:

--- a/hedgedoc_import.py
+++ b/hedgedoc_import.py
 #!/usr/bin/env python3
 import json
-
+from urllib.error import HTTPError
 from common import get_sessionid
 from pathlib import Path
 import urllib.parse
@@ -10,24 +10,34 @@ import subprocess
 from zipfile import ZipFile


+def check_accessibility(instance_url, session_id):
+    """Abort early unless the session cookie grants access to the instance.
+
+    Requests ``<instance_url>/me/`` with the session id sent as the
+    ``connect.hedgeDoc.sid`` cookie and expects a JSON body whose ``status``
+    field equals ``"ok"``.
+
+    Raises:
+        SystemExit: if the request is answered with an HTTP error, the body is
+            not valid JSON, or the reported status is anything other than ``"ok"``.
+    """
+    request_url = instance_url + '/me/'
+    headers = {"Cookie": f"connect.hedgeDoc.sid={session_id}"}
+    failure_message = (f"Could not access protected resources at {request_url}. Make sure that the specified "
+                       f"cookie is correct. Aborting...")
+
+    req = urllib.request.Request(request_url, headers=headers)
+    try:
+        with urllib.request.urlopen(req) as response:
+            response_json = json.load(response)
+    except (HTTPError, ValueError) as error:
+        # An HTTP error status or a non-JSON body (json.JSONDecodeError is a ValueError) is reported
+        # with the same abort message instead of a raw traceback.
+        raise SystemExit(failure_message) from error
+    # A payload without a "status" key counts as a failed check rather than raising KeyError.
+    if not isinstance(response_json, dict) or response_json.get("status") != "ok":
+        raise SystemExit(failure_message)
+
+
 def import_single_document(instance_url, hedgedoc_free_url, content, session_id):
+    """Upload one markdown document to HedgeDoc and return the URL of the response.
+
+    The document is POSTed to ``<instance_url>/new/<hedgedoc_free_url>`` with the session id
+    sent as the ``connect.hedgeDoc.sid`` cookie. An empty ``hedgedoc_free_url`` posts to
+    ``<instance_url>/new/``.
+
+    Raises:
+        urllib.error.HTTPError: if the server answers with an error status.
+    """
-    sanitized_free_url = hedgedoc_free_url.replace(" ", "%20")
+    # Percent-encode everything that is not valid inside a URL path (spaces, umlauts, "?", "#", ...),
+    # not only spaces; un-encoded non-ASCII characters make urllib fail before the request is sent.
+    sanitized_free_url = urllib.parse.quote(hedgedoc_free_url)
    request_url = instance_url + '/new/' + sanitized_free_url

-    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
-    headers = {'User-Agent': user_agent, "Cookie": f"connect.hedgeDoc.sid={session_id}",
-               "Content-Type": "text/markdown"}
+    headers = {"Cookie": f"connect.hedgeDoc.sid={session_id}", "Content-Type": "text/markdown"}

-    document_contents = str.encode(content)
-    req = urllib.request.Request(request_url, data=document_contents, method='POST', headers=headers)
+    req = urllib.request.Request(request_url, data=str.encode(content), method='POST', headers=headers)
    with urllib.request.urlopen(req) as response:
        return response.url


 def import_into_hedgedoc(instance_url, session_id, export_folder, archive_file):
-    folder = os.path.join(os.getcwd(), export_folder);
+    """Upload every document of an exported archive to HedgeDoc and open the results in a browser.
+
+    Checks the session with check_accessibility, asks for a browser with select_browser, reads
+    ``history.json`` from ``export_folder`` and then uploads each file found in ``archive_file``.
+    A file whose title is a key of ``lookup_map`` (built in lines not shown in this hunk) is
+    uploaded under the mapped path; if that upload is rejected with HTTP 409, or if there is no
+    mapping, it is uploaded with an empty path so that HedgeDoc creates a new ID. Finally the
+    chosen browser is started with all resulting URLs as arguments.
+
+    Raises:
+        urllib.error.HTTPError: for any failed upload other than a 409 on a mapped path.
+    """
+    check_accessibility(instance_url, session_id)
+    browser = select_browser()
    # get exported history map
-    with open(os.path.join(export_folder, "map.json")) as map_file:
+    with open(os.path.join(export_folder, "history.json")) as map_file:
        history_dictionary = json.load(map_file)

    # mapping from title of a document (= filename without md extension in archive) to its id (= note url in CodiMD)
@@ -40,44 +50,63 @@ def import_into_hedgedoc(instance_url, session_id, export_folder, archive_file):

    # iterate over files in archive
    with ZipFile(archive_file) as zf:
+        print("Now scanning your provided archive file containing the documents you are the owner of")
+        print("If you visited your own document via, e.g., \"codi-instance.tld/my_own_path\", " +
+              "this script tries to migrate it to the HedgeDoc instance at \"hedgedoc-instance.tld/my_own_path\"")
+        print("If this is not possible, a new random URL for the document will be created")
        for file in zf.namelist():
            document_title = Path(file).stem
            # check for every file if there is a match between its name (= title in history json) and an existing path
            # for a document
-            document_content = None
+            with zf.open(file) as f:
+                document_content = f.read().decode("UTF-8")

            if document_title in lookup_map:
-                print("filename: ", document_title, ", in lookupMap: ",
-                      lookup_map[document_title])
-                urls_to_visit.append(
-                    import_single_document(instance_url, lookup_map[document_title], document_content, session_id))
+                print(
+                    f"\tYou visited your own document \"{document_title}.md\" via the path " +
+                    f"\"{lookup_map[document_title]}\"")
+                print("\tTrying to migrate this document and make it available under the already visited path")
+                try:
+                    new_url = import_single_document(instance_url, lookup_map[document_title], document_content,
+                                                     session_id)
+                except HTTPError as error:
+                    # Only a conflict (path already taken) is recoverable; any other error must not be
+                    # swallowed, otherwise a stale or unbound URL would be recorded for this document.
+                    if error.code != 409:
+                        raise
+                    print("\tHTTP 409. Uploading anyways (new path, random ID)")
+                    new_url = import_single_document(instance_url, "", document_content, session_id)
+                # record each document exactly once, whichever of the two uploads succeeded
+                urls_to_visit.append(new_url)
            else:
                print("no mapping found for ", document_title, ", uploading anyway")
                # empty string implies HedgeDoc should create a new ID
                urls_to_visit.append(import_single_document(instance_url, "", document_content, session_id))
-
-    browser = "firefox"
+    print("Your specified browser now needs to visit every newly created document to ensure that it's available in "
+          "your history in HedgeDoc")
    subprocess.run([browser] + urls_to_visit)


 def select_browser():
-    options = ['chrome', 'firefox', 'opera']
+    """Ask the user which browser should open the uploaded documents.
+
+    Prints an explanation and then repeats a numbered prompt until the input is one of the
+    listed numbers.
+
+    Returns:
+        The name of the chosen browser executable, e.g. ``"firefox"``.
+    """
+    print("Once you've uploaded all your documents, they unfortunately do not appear in your HedgeDoc history.")
+    print("To make sure that they are available to you, this script automatically visits all your newly uploaded "
+          "documents in your browser.")
+    print("Therefore, you need to specify your browser. It needs to be on your path with the same name as "
+          "specified here")
+    print("ATTENTION - this needs to be a browser where you have an active and logged-in HedgeDoc session")
+    supported_browsers = ["firefox", "opera", "safari", "google-chrome", "chromium"]
    user_input = ''
-    input_message = "Pick an option:\n"
+    input_message = "Specify a browser which holds an active and logged-in HedgeDoc session:\n"

-    for index, item in enumerate(options):
-        input_message += f'{index + 1}) {item}\n'
+    for i, browser_suggestion in enumerate(supported_browsers):
+        input_message += f'{i + 1}) {browser_suggestion}\n'

-    input_message += 'Your choice: '

-    while user_input not in map(str, range(1, len(options) + 1)):
-        user_input = input(input_message)
+    valid_choices = {str(number) for number in range(1, len(supported_browsers) + 1)}
+    while user_input not in valid_choices:
+        # tolerate accidental whitespace around the typed number
+        user_input = input(input_message).strip()

-    print('You picked: ' + options[int(user_input) - 1])
+    chosen_browser = supported_browsers[int(user_input) - 1]
+    print('You chose: ' + chosen_browser)
+    return chosen_browser


 if __name__ == "__main__":
-    # select_browser()
-    import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", 123, "codimd-documents", "archive.zip")
-
-    # import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", get_sessionid("connect.hedgeDoc.sid"), "codimd-documents")
+    # import_into_hedgedoc takes four arguments; the archive path has to be passed as well,
+    # otherwise the call fails with a TypeError before anything is uploaded.
+    import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", get_sessionid("connect.hedgeDoc.sid"),
+                         "codimd-documents", "archive.zip")