Initial commit

2020-04-04 09:38:55 +02:00
commit 13b2e17b90
4 changed files with 218 additions and 0 deletions

doir.py (executable file, 131 lines)

@@ -0,0 +1,131 @@
import argparse
import http.server
import json
import os
import socketserver
from datetime import datetime

from habanero import Crossref
parser = argparse.ArgumentParser()
parser.add_argument("directory", help="directory which contains the files to resolve", type=str, default=".")
parser.add_argument("cachefile", help="path to the cache file containing already resolved DOIs", type=str,
                    default="cachefile.doir", nargs='?')
parser.add_argument("htmlfile", help="path to the html result file", type=str, default="index.html", nargs='?')
args = parser.parse_args()
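# Example invocation (cachefile and htmlfile are optional and fall back to
# their defaults; the directory path here is just illustrative):
#   ./doir.py ~/papers cachefile.doir index.html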
cr = Crossref()
results = {}
# Retry window for unresolved cache entries: fifty days, in seconds
fifty_days = 60 * 60 * 24 * 50
def query_crossref(doi, file):
    # Ask CrossRef for metadata on this DOI; returns None when no record exists.
    result = cr.works(filter={"doi": doi})
    if result['message']['total-results'] > 0:
        item = result['message']['items'][0]
        title = item['title'][0]
        # Join however many date parts are present (year, month, day) instead
        # of indexing blindly into the list.
        date_parts = item['created']['date-parts']
        date = "-".join(str(part) for part in date_parts[0])
        return {'title': title, 'date': date, 'file_ref': file}
    return None
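# Illustrative shape of a resolved result (values depend on the record):
#   {'title': 'Some Paper Title', 'date': '1973-6-1',
#    'file_ref': 'links/10.1145/361604.361612.pdf'}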
# Expected directory structure is:
# [directory_arg]
# - 10.1145, organisation DOI prefix (e.g. for ACM)
# - 4547.4552.pdf, file name is the document DOI suffix
# - 361604.361612.pdf
# - ...
# - another prefix
# - more suffixes
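# With that layout, the DOI 10.1145/361604.361612 is reconstructed from the
# path 10.1145/361604.361612.pdf: prefix directory + file name sans extension.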
def walk_files(dir, prefix):
    files = os.listdir(dir)
    for file in files:
        full_file = os.path.join(dir, file)
        print("Process file: " + full_file)
        if os.path.isdir(full_file):
            walk_files(dir=full_file, prefix=file)
        else:
            doi_suffix = os.path.splitext(file)[0]
            doi = prefix + "/" + doi_suffix
            file_ref = 'links/' + prefix + "/" + file
            # Query the cache first
            if doi in results:
                print("Cache hit!")
                result = results[doi]
                # Unresolved entries carry a timestamp; once such an entry is
                # older than fifty days, query CrossRef again.
                has_timestamp = 'timestamp' in result
                if has_timestamp and (result['timestamp'] + fifty_days) < datetime.now().timestamp():
                    result = query_crossref(doi, file_ref)
                    # Not found again, create a fresh cache-miss entry
                    if result is None:
                        result = {'timestamp': datetime.now().timestamp(), 'file_ref': file_ref}
            else:
                print("Cache miss!")
                # Check whether CrossRef has metadata for this DOI
                result = query_crossref(doi, file_ref)
                # If not, create a cache miss entry
                if result is None:
                    print("Not found!")
                    result = {'timestamp': datetime.now().timestamp(), 'file_ref': file_ref}
            results[doi] = result
def write_cache(file):
    with open(file, 'w') as cache_file:
        json.dump(results, cache_file)

def read_cache(file):
    if os.path.isfile(file):
        with open(file, 'r') as cache_file:
            return json.load(cache_file)
    else:
        return {}
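# The cache file is plain JSON keyed by DOI; an illustrative entry:
#   {"10.1145/361604.361612": {"title": "...", "date": "...",
#    "file_ref": "links/10.1145/361604.361612.pdf"}}
# Unresolved DOIs are stored with a 'timestamp' instead of title and date.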
def write_html(file):
    with open(file, 'w') as html_file:
        html_file.write("<html><head></head><body><table>"
                        "<tr><th>Title</th><th>Year</th><th>Link</th></tr>")
        for result in results.values():
            html_file.write("<tr>")
            html_file.write("<td>" + result.get('title', "No title!") + "</td>")
            html_file.write("<td>" + result.get('date', "No date!") + "</td>")
            html_file.write("<td><a href=\"" + result['file_ref'] + "\">click</a></td>")
            html_file.write("</tr>")
        html_file.write("</table></body></html>")
results = read_cache(file=args.cachefile)
walk_files(dir=args.directory, prefix=None)
write_cache(file=args.cachefile)
write_html(file=args.htmlfile)

# The symlink survives across runs, so only create it if it is not there yet.
if not os.path.islink('links'):
    os.symlink(args.directory, 'links')
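# Serve the generated index.html plus the 'links' symlink from the current
# working directory, so the table's relative file links resolve in a browser
# at http://localhost:8888.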
Handler = http.server.SimpleHTTPRequestHandler
httpd = socketserver.TCPServer(("", 8888), Handler)
httpd.serve_forever()