diff --git a/.gitignore b/.gitignore index 721edef0cfc63282d2d4f7f93eb174adf32fbf49..0283d2c6ffa66d0b8cb87857385354069aa2385f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,8 @@ __pycache__/ *.pyc codimd-documents/ + +.idea + +*.zip +hedgedoc_documents_to_visit.url \ No newline at end of file diff --git a/README.md b/README.md index ff738f51fbc57d377427be65c851edae3aa904f9..1432cddf085b649489805b85d51e895ab600243c 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,27 @@ -# CodiMD note exporter +```` +WARNING - Use this script at your own discretion. It's a prototype and not exhaustively tested and might throw HTTP errors that were not accounted for. +```` -This little tool is intended to help you backup the notes that you have in your CodiMD history to a -local folder. It complements CodiMD's "export user data" functionality, which only downloads -documents that you own. +# CodiMD Note Exporter + Hedgedoc Note Importer + +This little tool is intended to help you **backup** the notes that your CodiMD history shows to a local folder. +It complements CodiMD's "export user data" functionality, which only downloads +documents that you are the owner of. +This backup is not meant to be uploaded by you in its entirety, since this would create ownership issues of shared +documents. + +The export user data functionality of CodiMD creates an archive containing all the documents you are the owner of. +This tool furthermore implements functionality to upload each or a selection of your owned documents based on this +archive. +Based upon the path the document was served under at CodiMD, this script tries to migrate the document to the exact same +path if possible, e.g., https://md.inf.tu-dresden.de/my_custom_path gets mapped +to https://md.inf.tu-dresden.de/notes/my_custom_path + +Be aware that this approach is a little hacky. +Unfortunately, this is necessary since our CodiMD instance did use KeyCloak as an OAuth provider. 
+ +Follow the steps closely to ensure that everything works as intended and that the ownership of documents is not messed +up. ## Requirements @@ -10,9 +29,21 @@ Its only dependency is Python >= 3.7 (use `python3 -V` to check your version). ## Usage -### 1. Extract value of the session cookie +After cloning this repository, follow the following steps to migrate your documents. +If you only want to export your visited documents and import them manually, just follow steps 2 and 4. + +### 1. Export Your Owned Documents From Our Codimd Instance and Weed Them Out + +- Go to https://md.inf.tu-dresden.de/ +- Click on your username in the upper right and hit "*Export user data*" +- Download the file "archive.zip" and place it in the same folder as this file (README.md) +- **Weed your archive out**! Most likely, it will contain a lot of unused or empty documents. Delete those files in + place in the archive, don't extract them. -First of all, you need to extract the value of the `connect.sid` browser cookie for the +### 2. Extract the Value of Your Codimd Session Cookie + +To ensure that the script can access the documents you accessed in our CodiMD instance, you need to extract the value of +the `connect.sid` browser cookie for the particular CodiMD instance that you use (e.g., `md.inf.tu-dresden.de`). This procedure is necessary because the CodiMD login is handled externally by Keycloak. @@ -22,38 +53,34 @@ password**. Don't share it with others! Now, the instructions are similar for Chrome and Firefox: -1. Navigate your browser to the CodiMD instance. -2. Open developer tools. +1. Navigate your browser to the [CodiMD instance](https://md.inf.tu-dresden.de/). Be sure that you are logged in +2. Open developer tools (F12). 3. **Chrome**: go to the "Application" tab. **Firefox**: go to the "Storage" tab ("Web-Speicher"). 4. Un-collapse "Cookies". 5. In the list, search for a cookie with the name `connect.sid` 6. Select and copy the value. 
It must start with the character sequence `s%3A`. +### 3. Extract the Value of Your Hedgedoc Session Cookie + +Follow the instructions from step 2, but copy the value of the cookie key `connect.hedgeDoc.sid` -### 2. Execute the script via the command line +### 4. Execute the Script via the Command Line -Clone this repository to a path of your liking and just execute `./codimd_export.py` in a -shell. The script will ask for your session id and download the notes to the relative path -`./codimd-documents`. +Execute `./md-import-export.py` or `python3 md-import-export.py` in a +shell. The script will ask for your session IDs and download the notes you visited to the relative +path `./codimd-documents`. Please note that your CodiMD history might reference already deleted notes or notes you no longer have access to. The URLs of these inaccessible notes are listed as part of the -script's output. - - -### Example - - $️ ./codimd_export.py - Please provide your CodiMD session id (connect.sid cookie): - HTTP 404 Not Found: https://md.inf.tu-dresden.de/TOPofLSB - HTTP 404 Not Found: https://md.inf.tu-dresden.de/6jeKGnJnSp6H0f0mVnBwQQ - HTTP 403 Forbidden: https://md.inf.tu-dresden.de/klausur-2020-experiences - HTTP 404 Not Found: https://md.inf.tu-dresden.de/iQVB7TO-QXqi3hGSZ7J4Eg - HTTP 404 Not Found: https://md.inf.tu-dresden.de/LSB-2020-11-17 - HTTP 404 Not Found: https://md.inf.tu-dresden.de/Nzbj13ydRLe1d0fRCyG6cA - HTTP 404 Not Found: https://md.inf.tu-dresden.de/LSB-2021-02-02 - HTTP 404 Not Found: https://md.inf.tu-dresden.de/LSB-2021-06-22 - HTTP 404 Not Found: https://md.inf.tu-dresden.de/LSB-2021-12-07 - HTTP 404 Not Found: https://md.inf.tu-dresden.de/ThesisTemplateBachelor - HTTP 404 Not Found: https://md.inf.tu-dresden.de/cnoGl7hoTaK1JIh2-u-h0g - Done: 163 notes successfully downloaded, 11 not accessible. +output of the script. + +Furthermore, the script tries to upload every file in the archive with a `.md` extension to our HedgeDoc instance. + +### 5. 
Visit the Uploaded Documents to Make Them Appear in Your Hedgedoc History
+
+Uploading the documents is not enough to make them appear in your HedgeDoc history.
+You need to visit them at least once to make them available.
+The script automatically generates a file `history_scripts/hedgedoc_documents_to_visit.url`.
+You can either visit every file manually or execute the bash script `history_scripts/visit_migrated_documents.sh` (Unix & MacOS).
+Feel free to contribute a Batch-Script for Windows environments.
+Be aware: this might be pretty resource intensive and might take a while.
diff --git a/common.py b/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..937389531c2f25ea80f229786fc029abce8b682f
--- /dev/null
+++ b/common.py
@@ -0,0 +1,60 @@
+import json
+import sys
+import urllib.request
+from urllib.error import HTTPError
+
+if not sys.platform.startswith("win"):
+    from getpass import getpass
+else:
+    # on Windows, Ctrl-V doesn't work with getpass()
+    getpass = input
+
+
+def get_sessionid(service_name, cookie_key):
+    """Prompt the user for a session cookie value and return it.
+
+    Prints instructions that mention ``service_name`` and ``cookie_key``,
+    then reads the value with ``getpass`` (plain ``input`` on Windows) and
+    strips surrounding whitespace. Exits unless it starts with ``s%3A``.
+    """
+    print(f"You now will be asked for your session cookie for {service_name}")
+    print(f"To extract this cookie, you need to open your browser with an active and logged-in {service_name} session")
+    print(f"In Firefox or Chrome, you can do this by following the following steps:")
+    print(f"\t1. Open the developer tools (Shortcut: F12)")
+    print(f"\t2. Select \"Storage -> Cookies\" (Firefox) or \"Application -> Cookies\"")
+    print(f"\t3. Copy the value for the cookie named {cookie_key}")
+    print(f"\t4. Input it in the prompt")
+
+    sid = getpass(f"Please provide your {service_name} session id ({cookie_key} cookie): ").strip()
+    if sid.startswith("s%3A"):
+        return sid
+    raise SystemExit(f"error: the supplied session id seems to be malformed")
+
+
+def print_block_heading(message):
+    """Print ``message`` framed by a separator line above and below it."""
+    separator = "======================================================================================================"
+    print(separator)
+    print(message)
+    print(separator)
+
+
+#def check_accessibility(instance_url, session_id, cookie_key):
+#    request_url = instance_url + '/me/'
+#    headers = {"Cookie": f"{cookie_key}={session_id}"}
+#    try:
+#        req = urllib.request.Request(request_url, headers=headers)
+#        with urllib.request.urlopen(req) as response:
+#            response_json = json.load(response)
+#            if response_json["status"] != "ok":
+#                raise_no_connection_error(request_url)
+#        print(f"Could access protected resources at {instance_url}. Proceeding...")
+#    except HTTPError as error:
+#        print(f"HTTP {error.status} {error.reason}")
+#        raise_no_connection_error(request_url)
+
+
+def raise_no_connection_error(request_url):
+    """Abort via ``SystemExit``, naming the URL that could not be accessed."""
+    raise SystemExit(f"Could not access protected resources at {request_url}. Make sure that the specified "
+                     f"cookie is correct. Aborting...")
diff --git a/codimd_export.py b/export_md/codimd_export.py
similarity index 72%
rename from codimd_export.py
rename to export_md/codimd_export.py
index cfb60d1a5f6daef1c0f8953bebd7489dc3be906a..18a931e7a2557a11ea13202d762e7e2c0727e2bc 100755
--- a/codimd_export.py
+++ b/export_md/codimd_export.py
@@ -4,17 +4,13 @@ Save all Markdown documents in your CodiMD history to a local directory.
""" import json -import sys from pathlib import Path from urllib.error import HTTPError from urllib.parse import quote from urllib.request import Request, urlopen +import sys -if not sys.platform.startswith("win"): - from getpass import getpass -else: - # on Windows, Ctrl-V doesn't work with getpass() - getpass = input +from common import get_sessionid, print_block_heading def slurp(url, session_id): @@ -28,21 +24,28 @@ def prepare_target_dir(pathname): """Create the directory to dump documents to, but refuse to override an existing one.""" target_dir = Path(pathname) if target_dir.exists(): - raise SystemExit(f"error: the target directory {target_dir} already exists") + raise SystemExit(f"ERROR: the target directory {target_dir} already exists. Delete it, then re-execute this " + f"script") target_dir.mkdir() return target_dir -def main(instance_url, session_id, export_to): +def export_from_codimd(instance_url, session_id, export_to): """Retrieve CodiMD document history and try to download each document.""" + print_block_heading(f"Trying to fetch history ({instance_url})") try: data = json.loads(slurp(f"{instance_url}/history", session_id)) except OSError as error: raise SystemExit(f"error: couldn't access the /history endpoint: {error}") except json.JSONDecodeError as error: raise SystemExit(f"error: received malformed JSON: {error}") + print_block_heading(f"Preparing target directory ({export_to})") target_dir = prepare_target_dir(export_to) num_ok = num_fail = 0 + print_block_heading(f"Accessing history and trying to fetch each document") + with open(Path(target_dir, f"history.json"), mode="w") as stream: + json.dump(data, stream) + print("Hold on, this may take a while...") for row in data["history"]: document_id = row["id"] document_url = f"{instance_url}/{quote(document_id)}" @@ -59,13 +62,5 @@ def main(instance_url, session_id, export_to): print(f"Done: {num_ok} notes successfully downloaded, {num_fail} not accessible.") -def get_sessionid(): - """Ask the 
user for the session id, if it's not configured as an envvar."""
-    sid = getpass("Please provide your CodiMD session id (connect.sid cookie): ")
-    if sid.startswith("s%3A"):
-        return sid
-    raise SystemExit(f"error: the supplied session id seems to be malformed")
-
-
 if __name__ == "__main__":
-    main("https://md.inf.tu-dresden.de", get_sessionid(), "codimd-documents")
+    export_from_codimd("https://md.inf.tu-dresden.de", get_sessionid("CodiMD", "connect.sid"), "codimd-documents")
diff --git a/history_scripts/visit_migrated_documents.sh b/history_scripts/visit_migrated_documents.sh
new file mode 100755
index 0000000000000000000000000000000000000000..69afd3e17adab7657ac52a3061e6a0b0871e8cdb
--- /dev/null
+++ b/history_scripts/visit_migrated_documents.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Open every URL listed (one per line) in hedgedoc_documents_to_visit.url,
+# which is expected next to this script, using xdg-open if present, else open.
+open_command=open
+if [[ $(command -v xdg-open) ]];
+then
+  open_command=xdg-open
+fi
+script_directory=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+url_file=$script_directory/hedgedoc_documents_to_visit.url
+# Plain read loop instead of readarray, which requires bash 4 or newer.
+while IFS= read -r url || [[ -n "$url" ]]
+do
+  if [[ -n "$url" ]]
+  then
+    # No eval: the URL is passed as one literal argument, never re-parsed as shell code.
+    "$open_command" "$url" < /dev/null
+  fi
+done < "$url_file"
\ No newline at end of file
diff --git a/import_md/hedgedoc_import.py b/import_md/hedgedoc_import.py
new file mode 100644
index 0000000000000000000000000000000000000000..65978839c36bf22cbc8506e3fd9d4714bf2794be
--- /dev/null
+++ b/import_md/hedgedoc_import.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+import json
+from urllib.error import HTTPError
+from common import get_sessionid, print_block_heading
+from pathlib import Path
+import urllib.parse
+import urllib.request
+import os
+from zipfile import ZipFile
+
+
+def import_single_document(instance_url, hedgedoc_free_url, content, session_id):
+    sanitized_free_url = urllib.parse.quote(hedgedoc_free_url)
+    request_url = instance_url + '/new/' + sanitized_free_url
+
+    headers = {"Cookie": f"connect.hedgeDoc.sid={session_id}", "Content-Type": "text/markdown"}
+
+    req = urllib.request.Request(request_url, 
data=str.encode(content), method='POST', headers=headers)
+    # unfortunately, no error is thrown if a document is not created when session cookie is invalid
+    # HTTP 409 is not handled here for the sake of simplicity; see try_generate_free_url_document(..)
+    # Not optimal, but nobody ain't got time for that
+    with urllib.request.urlopen(req) as response:
+        if response.url == instance_url + "/":
+            raise SystemExit("Could not import document. Please check your HedgeDoc session cookie. Aborting...")
+        return response.url
+
+
+def check_file_exists(file):
+    if not os.path.exists(file):
+        raise SystemExit(
+            f"ERROR: File {file} does not exist. Export your data from CodiMD and re-execute this script again!"
+            f" Aborting...")
+    print(f"Required file {file} exists. Proceeding...")
+
+
+def import_into_hedgedoc(instance_url, session_id, export_folder, archive_file):
+    print_block_heading(
+        f"Checking existence of archive file ({archive_file})")
+    check_file_exists(archive_file)
+    # get exported history map
+    history_json_filename = "history.json"
+    history_dictionary = {}
+    try:
+        with open(os.path.join(export_folder, history_json_filename)) as map_file:
+            history_dictionary = json.load(map_file)
+    except FileNotFoundError:
+        print_block_heading(
+            f"INFO: could not find file {history_json_filename}. Continuing anyways with random generated paths for "
+            f"documents")
+
+    # mapping from title of a document (= filename without md extension in archive) to its id (= note url in CodiMD)
+    lookup_map = {}
+    if "history" in history_dictionary:
+        for entry in history_dictionary["history"]:
+            lookup_map[entry["text"]] = entry["id"]
+
+    # URLs to visit to make the new document available in the history
+    urls_to_visit = []
+
+    process_archive_export(archive_file, instance_url, lookup_map, session_id, urls_to_visit)
+    create_urls_to_visit_file("history_scripts/hedgedoc_documents_to_visit.url", urls_to_visit)
+
+
+def process_archive_export(archive_file, instance_url, lookup_map, session_id, urls_to_visit):
+    # iterate over the Markdown files in the archive; directory entries and other file types are skipped
+    with ZipFile(archive_file) as zf:
+        print("Now scanning your provided archive file containing the documents you are the owner of")
+        print("If you visited your own document via a, e.g., \"codi-instance.tld/my_specified_free_url\" " +
+              "this script tries to migrate it to the HedgeDoc instance at \"hedgedoc-instance.tld/my_own_path\"")
+        print("If this is not possible, a new random URL for the document will be created")
+        print_block_heading("Iterating over files in archive and trying to upload them")
+        for file in (name for name in zf.namelist() if name.endswith(".md")):
+            document_title = Path(file).stem
+            # check for every file if there is a match between its name (= title in history json) and an existing path
+            # for a document
+            with zf.open(file) as f:
+                document_content = f.read().decode("UTF-8")
+
+            if document_title in lookup_map:
+                try_generate_free_url_document(document_content, document_title, instance_url, lookup_map, session_id,
+                                               urls_to_visit)
+            else:
+                print(
+                    f"According to your history (or lack thereof) , you did not visit \"{document_title}.md\" in the "
+                    f"CodiMD instance recently. 
Migrating the document and generating a new, random URL/path for it")
+                # empty string implies HedgeDoc should create a new ID
+                generated_url = import_single_document(instance_url, "", document_content, session_id)
+                print(f"New URL after document migration with new, random URL/subpath: "
+                      f"{generated_url}")
+                urls_to_visit.append(generated_url)
+            print()
+
+
+def try_generate_free_url_document(document_content, document_title, instance_url, lookup_map, session_id,
+                                   urls_to_visit):
+    print(
+        f"You visited your own document \"{document_title}.md\" via the identifier/path " +
+        f"\"{lookup_map[document_title]}\"")
+    print(f"Trying to migrate this document and make it available under the already visited path")
+    try:
+        new_url = import_single_document(instance_url, lookup_map[document_title], document_content,
+                                         session_id)
+        print(f"Migration was possible. New URL: {new_url}")
+    except HTTPError as error:
+        if error.status == 409:
+            print("ATTENTION: Could not migrate document with the same path. Uploading anyways and "
+                  "creating a new, random path")
+            new_url = import_single_document(instance_url, "", document_content, session_id)
+            print(f"New URL after document migration without migrating the URL/subpath: {new_url}")
+        else:
+            raise SystemExit(f"Could not create document (HTTP {error.status}). Please check your session cookie. Aborting...") from error
+    urls_to_visit.append(new_url)
+
+
+def create_urls_to_visit_file(filename, urls_to_visit):
+    print_block_heading("Creating file containing the URLs to visit")
+    print(f"A new file {filename} will be created. It contains all URLs you need to visit in order to make the "
+          f"migrated documents appear in your HedgeDoc history.")
+    print("This can be automated by running the scripts in the directory \"history_scripts\"")
+    print("BE AWARE: Opening a lot of tabs might be quite resource-intensive.")
+    with open(filename, 'w') as f:
+        for url in urls_to_visit:
+            f.write(url + "\n")
+
+
+if __name__ == "__main__":
+    import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", get_sessionid("HedgeDoc", "connect.hedgeDoc.sid"),
+                         "codimd-documents", "../archive.zip")
diff --git a/md-import-export.py b/md-import-export.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcadb26a0127f8d622b1a3dc5fa2a53706f7cb37
--- /dev/null
+++ b/md-import-export.py
@@ -0,0 +1,16 @@
+"""Run the CodiMD export, then the HedgeDoc import of the documents in archive.zip."""
+
+from export_md.codimd_export import export_from_codimd
+from import_md.hedgedoc_import import import_into_hedgedoc
+from common import get_sessionid, print_block_heading
+
+if __name__ == "__main__":
+    export_folder = "codimd-documents"
+    export_archive = "archive.zip"
+
+    print_block_heading("Beginning export from CodiMD...")
+    export_from_codimd("https://md.inf.tu-dresden.de", get_sessionid("CodiMD", "connect.sid"), export_folder)
+
+    print_block_heading("Beginning import to HedgeDoc...")
+    import_into_hedgedoc("https://md.inf.tu-dresden.de/notes", get_sessionid("HedgeDoc", "connect.hedgeDoc.sid"),
+                         export_folder, export_archive)