From 13f564f939e515d4236ccfa009584d1ec15f13b9 Mon Sep 17 00:00:00 2001
From: Andreas Domanowski <andreas@domanowski.net>
Date: Wed, 1 Mar 2023 12:12:40 +0100
Subject: [PATCH] WIP: Implement HedgeDoc import with visiting URLs after
 upload, Add accessibility check

---
 codimd_export.py   |  2 +-
 hedgedoc_import.py | 81 +++++++++++++++++++++++++++++++---------------
 2 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/codimd_export.py b/codimd_export.py
index 14e5d96..cf3f0b7 100755
--- a/codimd_export.py
+++ b/codimd_export.py
@@ -46,7 +46,7 @@ def export_from_codimd(instance_url, session_id, export_to):
             contents = slurp(f"{document_url}/download", session_id)
             with open(Path(target_dir, f"{document_id}.md"), mode="wb") as stream:
                 stream.write(contents)
-            with open(Path(target_dir, f"map.map"), mode="w") as stream:
+            with open(Path(target_dir, f"history.json"), mode="w") as stream:
                 json.dump(data, stream)
             num_ok += 1
         except HTTPError as error:
diff --git a/hedgedoc_import.py b/hedgedoc_import.py
index 95d1b59..c539dfb 100644
--- a/hedgedoc_import.py
+++ b/hedgedoc_import.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 import json
-
+from urllib.error import HTTPError
 from common import get_sessionid
 from pathlib import Path
 import urllib.parse
@@ -10,24 +10,34 @@ import subprocess
 from zipfile import ZipFile
 
 
+def check_accessibility(instance_url, session_id):
+    """Raise SystemExit unless GET <instance_url>/me/ with the session cookie reports status "ok"."""
+    request_url = instance_url + '/me/'
+    headers = {"Cookie": f"connect.hedgeDoc.sid={session_id}"}
+    req = urllib.request.Request(request_url, headers=headers)
+    with urllib.request.urlopen(req) as response:
+        response_json = json.load(response)
+        if response_json["status"] != "ok":
+            raise SystemExit(f"Could not access protected resources at {request_url}. Make sure that the specified "
+                             f"cookie is correct. Aborting...")
+
+
 def import_single_document(instance_url, hedgedoc_free_url, content, session_id):
     sanitized_free_url = hedgedoc_free_url.replace(" ", "%20")
     request_url = instance_url + '/new/' + sanitized_free_url
 
-    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
-    headers = {'User-Agent': user_agent, "Cookie": f"connect.hedgeDoc.sid={session_id}",
-               "Content-Type": "text/markdown"}
+    headers = {"Cookie": f"connect.hedgeDoc.sid={session_id}", "Content-Type": "text/markdown"}
 
-    document_contents = str.encode(content)
-    req = urllib.request.Request(request_url, data=document_contents, method='POST', headers=headers)
+    req = urllib.request.Request(request_url, data=str.encode(content), method='POST', headers=headers)
     with urllib.request.urlopen(req) as response:
         return response.url
 
 
 def import_into_hedgedoc(instance_url, session_id, export_folder, archive_file):
-    folder = os.path.join(os.getcwd(), export_folder);
+    check_accessibility(instance_url, session_id)
+    browser = select_browser()
     # get exported history map
-    with open(os.path.join(export_folder, "map.json")) as map_file:
+    with open(os.path.join(export_folder, "history.json")) as map_file:
         history_dictionary = json.load(map_file)
 
     # mapping from title of a document (= filename without md extension in archive) to its id (= note url in CodiMD)
@@ -40,44 +50,63 @@ def import_into_hedgedoc(instance_url, session_id, export_folder, archive_file):
 
     # iterate over files in archive
     with ZipFile(archive_file) as zf:
+        print("Now scanning your provided archive file containing the documents you are the owner of")
+        print("If you visited your own document via, e.g., \"codi-instance.tld/my_own_path\", " +
+              "this script tries to migrate it to the HedgeDoc instance at \"hedgedoc-instance.tld/my_own_path\"")
+        print("If this is not possible, a new random URL for the document will be created")
         for file in zf.namelist():
             document_title = Path(file).stem
             # check for every file if there is a match between its name (= title in history json) and an existing path
             # for a document
-            document_content = None
+            with zf.open(file) as f:
+                document_content = f.read().decode("UTF-8")
 
             if document_title in lookup_map:
-                print("filename: ", document_title, ", in lookupMap: ",
-                      lookup_map[document_title])
-                urls_to_visit.append(
-                    import_single_document(instance_url, lookup_map[document_title], document_content, session_id))
+                print(
+                    f"\tYou visited your own document \"{document_title}.md\" via the path " +
+                    f"\"{lookup_map[document_title]}\"")
+                print("\tTrying to migrate this document and make it available under the already visited path")
+                try:
+                    new_url = import_single_document(instance_url, lookup_map[document_title], document_content,
+                                                     session_id)
+                except HTTPError as error:
+                    if error.code != 409:
+                        raise  # only a path conflict is recoverable; never record an unset/stale URL
+                    print("\tHTTP 409. Uploading anyways (new path, random ID)")
+                    new_url = import_single_document(instance_url, "", document_content, session_id)
+                urls_to_visit.append(new_url)
             else:
                 print("no mapping found for ", document_title, ", uploading anyway")
                 # empty string implies HedgeDoc should create a new ID
                 urls_to_visit.append(import_single_document(instance_url, "", document_content, session_id))
-
-    browser = "firefox"
+    print("Your specified browser now needs to visit every newly created document to ensure that it's available in"
+          " your history in HedgeDoc")
     subprocess.run([browser] + urls_to_visit)
 
 
 def select_browser():
-    options = ['chrome', 'firefox', 'opera']
+    print("Once you've uploaded all your documents, they unfortunately do not appear in your HedgeDoc history.")
+    print("To make sure that they are available to you, this script automatically visits all your newly uploaded "
+          "documents in your browser.")
+    print("Therefore, you need to specify your browser. It needs to be on your path with the same name as "
+          "specified here")
+    print("ATTENTION - this needs to be a browser where you have an active and logged-in HedgeDoc session")
+    supported_browsers = ["firefox", "opera", "safari", "google-chrome", "chromium"]
     user_input = ''
-    input_message = "Pick an option:\n"
+    input_message = "Specify a browser which holds an active and logged-in HedgeDoc session:\n"
 
-    for index, item in enumerate(options):
-        input_message += f'{index + 1}) {item}\n'
+    for i, browser_suggestion in enumerate(supported_browsers):
+        input_message += f'{i + 1}) {browser_suggestion}\n'
 
-    input_message += 'Your choice: '
+    # Re-prompt until the reply is one of the menu numbers listed in input_message.
 
-    while user_input not in map(str, range(1, len(options) + 1)):
+    while user_input not in map(str, range(1, len(supported_browsers) + 1)):
         user_input = input(input_message)
 
-    print('You picked: ' + options[int(user_input) - 1])
+    print('You chose: ' + supported_browsers[int(user_input) - 1])
+    return supported_browsers[int(user_input) - 1]
 
 
 if __name__ == "__main__":
-    # select_browser()
-    import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", 123, "codimd-documents", "archive.zip")
-
-    # import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", get_sessionid("connect.hedgeDoc.sid"), "codimd-documents")
+    import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", get_sessionid("connect.hedgeDoc.sid"),
+                         "codimd-documents", "archive.zip")
-- 
GitLab