Skip to content
Snippets Groups Projects
Select Git revision
  • main default protected
1 result

hedgedoc_import.py

Blame
  • hedgedoc_import.py 5.69 KiB
    #!/usr/bin/env python3
    import json
    from urllib.error import HTTPError
    from common import get_sessionid
    from pathlib import Path
    import urllib.parse
    import urllib.request
    import os
    import subprocess
    from zipfile import ZipFile
    
    
    def check_accessibility(instance_url, session_id):
        """Abort the script unless the session cookie grants access to the instance.

        Requests ``<instance_url>/me/`` with the session cookie attached and inspects the
        ``status`` field of the JSON reply.

        :param instance_url: base URL of the HedgeDoc instance; ``/me/`` is appended verbatim
        :param session_id: value sent as the ``connect.hedgeDoc.sid`` cookie
        :raises SystemExit: if the reply's ``status`` is anything other than ``"ok"``
        """
        profile_url = instance_url + '/me/'
        profile_request = urllib.request.Request(
            profile_url,
            headers={"Cookie": f"connect.hedgeDoc.sid={session_id}"},
        )

        with urllib.request.urlopen(profile_request) as reply:
            profile = json.load(reply)
            if profile["status"] == "ok":
                # the cookie is accepted, nothing else to do
                return
            raise SystemExit(f"Could not access protected resources at {profile_url}. Make sure that the specified "
                             f"cookie is correct. Aborting...")
    
    
    def import_single_document(instance_url, hedgedoc_free_url, content, session_id):
        """Upload one markdown document and return the URL reported for the response.

        The document is sent via POST to ``<instance_url>/new/<hedgedoc_free_url>``. With an empty
        ``hedgedoc_free_url`` the request goes to ``<instance_url>/new/``; the callers in this file
        use that to let HedgeDoc create a new ID.

        :param instance_url: base URL of the HedgeDoc instance; ``/new/`` is appended verbatim
        :param hedgedoc_free_url: desired path of the note, or ``""``
        :param content: markdown text of the document, sent UTF-8 encoded
        :param session_id: value sent as the ``connect.hedgeDoc.sid`` cookie
        :return: ``response.url`` of the answer to the upload request
        :raises urllib.error.HTTPError: passed through from ``urlopen`` when the server answers
            with an error status
        """
        # Percent-encode every character that is not valid inside a URL path, not only spaces:
        # a "#" or "?" would cut the path short and non-ASCII characters cannot be sent in the
        # request line at all. "/" and "%" are kept as they are so that paths which were passed
        # through unchanged before (including already percent-encoded ones) still are.
        sanitized_free_url = urllib.parse.quote(hedgedoc_free_url, safe="/%")
        request_url = instance_url + '/new/' + sanitized_free_url

        headers = {"Cookie": f"connect.hedgeDoc.sid={session_id}", "Content-Type": "text/markdown"}

        req = urllib.request.Request(request_url, data=content.encode("utf-8"), method='POST', headers=headers)
        with urllib.request.urlopen(req) as response:
            return response.url
    
    
    def check_archive_exists(archive_file):
        """Abort the script if the exported archive is not present.

        :param archive_file: path of the archive to look for
        :raises SystemExit: if nothing exists at ``archive_file``
        """
        if os.path.exists(archive_file):
            return

        message = (
            f"ERROR: File {archive_file} does not exist. Export your data from CodiMD and re-execute this script again!"
            f" Aborting..."
        )
        raise SystemExit(message)
    
    
    def import_into_hedgedoc(instance_url, session_id, export_folder, archive_file):
        """Run the whole import of an exported archive into a HedgeDoc instance.

        In order: make sure the archive exists, make sure the session can access the instance,
        load ``history.json`` from ``export_folder``, upload the archive's files and finally write
        the collected URLs to ``../hedgedoc_documents_to_visit.url``.

        :param instance_url: base URL of the HedgeDoc instance
        :param session_id: value of the HedgeDoc session cookie
        :param export_folder: folder that contains the exported ``history.json``
        :param archive_file: path of the archive holding the exported documents
        :raises SystemExit: if the archive is missing or the instance is not accessible
        """
        print("Beginning import_md")
        check_archive_exists(archive_file)
        check_accessibility(instance_url, session_id)

        # load the exported history
        history_path = os.path.join(export_folder, "history.json")
        with open(history_path) as history_file:
            exported_history = json.load(history_file)

        # title of a document (= filename without md extension in archive) -> its id (= note url in CodiMD)
        lookup_map = {}
        for history_entry in exported_history["history"]:
            note_id = history_entry["id"]
            title = history_entry["text"]
            lookup_map[title] = note_id

        # filled during the upload with the URLs to visit to make the new documents available in the history
        urls_to_visit = []
        iterate_over_archive(archive_file, instance_url, lookup_map, session_id, urls_to_visit)

        create_urls_to_visit_file("../hedgedoc_documents_to_visit.url", urls_to_visit)
    
    
    def iterate_over_archive(archive_file, instance_url, lookup_map, session_id, urls_to_visit):
        """Upload every file of the archive and collect the URLs of the created notes.

        For a file whose title (file name without extension) is a key of ``lookup_map``, the upload
        is first attempted under the mapped path. If the server answers with HTTP 409 the document
        is uploaded again with an empty path so that a new, random one is generated. All other
        files are uploaded with an empty path right away.

        :param archive_file: path of the zip archive holding the exported documents
        :param instance_url: base URL of the HedgeDoc instance
        :param lookup_map: mapping from document title to the path it was visited under
        :param session_id: value of the HedgeDoc session cookie
        :param urls_to_visit: list that receives exactly one URL per uploaded document
        :raises urllib.error.HTTPError: for every upload error other than a 409 on the first attempt
        """
        # iterate over files in archive
        with ZipFile(archive_file) as zf:
            print("Now scanning your provided archive file containing the documents you are the owner of")
            print("If you visited your own document via a, e.g., \"codi-instance.tld/my_own_path\") " +
                  "this script tries to migrate it to the HedgeDoc instance at \"hedgedoc-instance.tld/my_own_path\"")
            print("If this is not possible, a new random URL for the document will be created")
            for file in zf.namelist():
                document_title = Path(file).stem
                with zf.open(file) as f:
                    document_content = f.read().decode("UTF-8")

                # check for every file if there is a match between its name (= title in history json) and an
                # existing path for a document
                if document_title not in lookup_map:
                    print(f"According to your history, you did not visit \"{document_title}.md\" in the CodiMD "
                          "instance recently. Migrating the document and generating a new, random URL/path for it")
                    # empty string implies HedgeDoc should create a new ID
                    generated_url = import_single_document(instance_url, "", document_content, session_id)
                    print(f"New URL after document migration with new, random URL/subpath: "
                          f"{generated_url}")
                    urls_to_visit.append(generated_url)
                    continue

                visited_path = lookup_map[document_title]
                print(
                    f"\tYou visited your own document \"{document_title}.md\" via the identifier/path " +
                    f"\"{visited_path}\"")
                print("\tTrying to migrate this document and make it available under the already visited path")
                try:
                    new_url = import_single_document(instance_url, visited_path, document_content, session_id)
                except HTTPError as error:
                    if error.code != 409:
                        # Any other failure means the document was NOT uploaded. Let it surface (as it
                        # does for the uploads outside this try block) instead of continuing with a
                        # URL that does not belong to this document.
                        raise
                    # 409: presumably the requested path is already taken on the target instance
                    print("\tATTENTION: Could not migrate document with the same path. Uploading anyways and "
                          "creating a new, random path")
                    new_url = import_single_document(instance_url, "", document_content, session_id)
                    print(f"New URL after document migration without migrating the URL/subpath: {new_url}")
                else:
                    print(f"\tMigration was possible. New URL: {instance_url}/{visited_path}")
                # single append for both outcomes so that every document is listed exactly once
                urls_to_visit.append(new_url)
    
    
    def create_urls_to_visit_file(filename, urls_to_visit):
        """Write the given URLs to ``filename``, one per line, and tell the user about it.

        An existing file is overwritten; an empty ``urls_to_visit`` produces an empty file.

        :param filename: path of the file to create
        :param urls_to_visit: iterable of URL strings
        """
        # note the trailing space: the two literals are joined without any separator
        print(f"A new file {filename} will be created. It contains all URLs you need to visit in order to make the "
              f"migrated documents appear in your HedgeDoc history.")
        print("This can be automated by running the scripts in the directory \"history_scripts\"")
        print("BE AWARE: Opening a lot of tabs might be quite resource-intensive.")
        with open(filename, 'w') as f:
            f.writelines(url + "\n" for url in urls_to_visit)
    
    
    if __name__ == "__main__":
        # Script entry point: import the exported documents into the configured instance,
        # using the session id obtained from get_sessionid.
        import_into_hedgedoc(
            instance_url="https://md.inf.tu-dresden.de/notes",
            session_id=get_sessionid("HedgeDoc", "connect.hedgeDoc.sid"),
            export_folder="codimd-documents",
            archive_file="../archive.zip",
        )