#!/usr/bin/env python3
"""doir -- resolve the DOIs of locally stored papers and serve an index.

The script walks a directory of documents whose paths encode their DOI,
looks each DOI up on CrossRef (remembering the answers in a JSON cache
file), writes an HTML table of the results and finally serves the current
working directory over HTTP on port 8888.

Expected directory structure is:
[directory_arg]
  - 10.1145, organisation DOI prefix (e.g. for ACM)
    - 4547.4552.pdf, file name is the document DOI suffix
    - 361604.361612.pdf
    - ...
  - another prefix
    - more suffixes

NOTE(review): this file was recovered from a whitespace-collapsed patch in
which the HTML tags inside ``write_html`` had been stripped; the table
markup below is a reconstruction based on the surviving cell texts.
"""
import os.path
import argparse
import html
import json
from datetime import datetime
import http.server
import socketserver
import urllib.parse

# CrossRef client. Created on first use by query_crossref() so that
# importing this module (or working purely from the cache) needs neither
# the third-party ``habanero`` package nor any set-up work.
cr = None

# DOI -> cache entry. A resolved entry holds 'date', 'file_ref' and usually
# 'title'; an unresolved ("miss") entry holds 'timestamp' and 'file_ref'.
results = {}

# Fifty days, in seconds: how long a miss entry is trusted before CrossRef
# is asked again.
fifty_days = 60 * 60 * 24 * 50

# Name of the symlink (relative to the working directory) through which the
# HTTP server exposes the documents; every 'file_ref' starts with it.
LINKS_NAME = 'links'

# TCP port the result page is served on.
PORT = 8888


def query_crossref(doi, file):
    """Look up *doi* on CrossRef.

    :param doi: full DOI, ``<prefix>/<suffix>``.
    :param file: link target stored in the returned entry as ``file_ref``.
    :return: ``{'title', 'date', 'file_ref'}`` for the first matching work
        (``title`` is omitted when the record carries none), or ``None``
        when CrossRef reports no result for the DOI.
    """
    global cr
    if cr is None:
        # Local import: see the comment on ``cr`` above.
        from habanero import Crossref
        cr = Crossref()

    message = cr.works(filter={"doi": doi})['message']
    if message['total-results'] <= 0:
        return None

    item = message['items'][0]
    entry = {'file_ref': file}

    # Not every record has a title; write_html() copes with its absence.
    titles = item.get('title') or []
    if titles:
        entry['title'] = titles[0]

    # 'date-parts' is a list of [year, month, day] lists; use the first.
    date_parts = item['created']['date-parts'][0]
    entry['date'] = "-".join(str(part) for part in date_parts[:3])

    return entry


def _miss_entry(file_ref):
    """Return a cache entry recording that CrossRef did not know the DOI."""
    return {'timestamp': datetime.now().timestamp(), 'file_ref': file_ref}


def _is_stale_miss(entry):
    """Tell whether *entry* is a miss entry older than fifty days."""
    if 'timestamp' not in entry:
        # Resolved entries carry no timestamp and never expire.
        return False
    return entry['timestamp'] + fifty_days < datetime.now().timestamp()


def walk_files(dir, prefix):
    """Resolve every document below *dir* and record it in ``results``.

    :param dir: directory to scan; sub-directories are scanned recursively.
    :param prefix: DOI prefix of the files in *dir* (the name of the
        directory itself), or ``None`` for the root directory, whose
        regular files belong to no prefix and are skipped.
    """
    for file in sorted(os.listdir(dir)):
        full_file = os.path.join(dir, file)

        print("Process file: " + full_file)

        if os.path.isdir(full_file):
            walk_files(dir=full_file, prefix=file)
            continue

        if prefix is None:
            # A file directly in the root has no DOI prefix to combine with.
            print("Skipping file outside of a DOI prefix directory: " + full_file)
            continue

        doi = prefix + "/" + os.path.splitext(file)[0]
        file_ref = LINKS_NAME + '/' + prefix + "/" + file

        # Query the cache first
        if doi in results:
            print("Cache hit!")

            result = results[doi]

            if _is_stale_miss(result):
                # The "not found" answer is older than 50 days:
                # query CrossRef again.
                result = query_crossref(doi, file_ref)

                # Not found again, create a new cache miss entry
                if result is None:
                    result = _miss_entry(file_ref)
        else:
            print("Cache miss!")

            # Check whether CrossRef has metadata for this DOI
            result = query_crossref(doi, file_ref)

            # If not, create a cache miss entry
            if result is None:
                print("Not found!")

                result = _miss_entry(file_ref)

        results[doi] = result


def write_cache(file):
    """Serialise ``results`` as JSON into the cache file at path *file*."""
    with open(file, 'w') as cache_file:
        json.dump(results, cache_file)


def read_cache(file):
    """Return the cache stored at path *file*, or ``{}`` if there is none."""
    if not os.path.isfile(file):
        return {}

    with open(file, 'r') as cache_file:
        return json.load(cache_file)


def write_html(file):
    """Write ``results`` as an HTML table (Title / Year / Link) to *file*.

    Titles and dates originate from an external service and file names from
    the file system, so all of them are escaped before being embedded.
    """
    rows = []
    for entry in results.values():
        title = html.escape(entry['title']) if 'title' in entry else "No title!"
        date = html.escape(entry['date']) if 'date' in entry else "No date!"
        # Percent-encode first (spaces, '#', '?', ...), then escape for the
        # attribute context.
        href = html.escape(urllib.parse.quote(entry['file_ref']), quote=True)
        rows.append(
            "<tr><td>" + title + "</td><td>" + date + "</td>"
            '<td><a href="' + href + '">click</a></td></tr>\n'
        )

    with open(file, 'w', encoding='utf-8') as html_file:
        html_file.write(
            '<!DOCTYPE html>\n<html>\n<head><meta charset="utf-8"></head>\n'
            '<body>\n<table>\n'
        )
        html_file.write("<tr><th>Title</th><th>Year</th><th>Link</th></tr>\n")
        html_file.writelines(rows)
        html_file.write("</table>\n</body>\n</html>\n")


def _build_parser():
    """Create the command-line parser (all three arguments are optional)."""
    parser = argparse.ArgumentParser()
    # nargs='?' is what makes the declared default effective for a positional.
    parser.add_argument("directory", help="directory which contains the files to resolve", type=str,
                        default=".", nargs='?')
    parser.add_argument("cachefile", help="path to the cache file containing already resolved DOIs", type=str,
                        default="cachefile.doir", nargs='?')
    parser.add_argument("htmlfile", help="path to the html result file", type=str, default="index.html", nargs='?')
    return parser


def _link_directory(directory):
    """Point the ``links`` symlink at *directory*, replacing a previous link.

    A path named ``links`` that is not a symlink is left alone, in which
    case ``os.symlink`` raises ``FileExistsError``.
    """
    if os.path.islink(LINKS_NAME):
        os.remove(LINKS_NAME)

    os.symlink(os.path.abspath(directory), LINKS_NAME)


class _ReusableTCPServer(socketserver.TCPServer):
    """TCP server that may rebind its port right after a restart."""

    allow_reuse_address = True


def main(argv=None):
    """Resolve, cache, render and serve; *argv* defaults to ``sys.argv[1:]``."""
    args = _build_parser().parse_args(argv)

    # Fill the shared dict in place so every function sees the loaded cache.
    results.clear()
    results.update(read_cache(file=args.cachefile))

    walk_files(dir=args.directory, prefix=None)
    write_cache(file=args.cachefile)
    write_html(file=args.htmlfile)

    _link_directory(args.directory)

    handler = http.server.SimpleHTTPRequestHandler
    with _ReusableTCPServer(("", PORT), handler) as httpd:
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            print("Shutting down.")


if __name__ == "__main__":
    main()