Initial commit
This commit is contained in:
131
doir.py
Executable file
131
doir.py
Executable file
@@ -0,0 +1,131 @@
|
||||
import os.path
|
||||
import argparse
|
||||
from habanero import Crossref
|
||||
import json
|
||||
from datetime import datetime
|
||||
import http.server
|
||||
import socketserver
|
||||
|
||||
# Command line interface: three positional arguments, all of them optional.
parser = argparse.ArgumentParser()
# nargs='?' is what makes a positional argument optional; without it argparse
# still demands a value and the declared default is never used.
parser.add_argument("directory", help="directory which contains the files to resolve", type=str, default=".",
                    nargs='?')
parser.add_argument("cachefile", help="path to the cache file containing already resolved DOIs", type=str,
                    default="cachefile.doir", nargs='?')
parser.add_argument("htmlfile", help="path to the html result file", type=str, default="index.html", nargs='?')
args = parser.parse_args()

# CrossRef client (from habanero) used for all metadata lookups.
cr = Crossref()
# In-memory cache, keyed by DOI ("<prefix>/<suffix>").
results = {}
# Fifty days, expressed in seconds so it can be added to POSIX timestamps.
fifty_days = 60 * 60 * 24 * 50
|
||||
|
||||
|
||||
def query_crossref(doi, file):
    """Look up *doi* on CrossRef and build a cache entry for it.

    :param doi: full DOI ("<prefix>/<suffix>") to look up
    :param file: link target stored in the entry under ``'file_ref'``
    :return: a dict with ``'file_ref'`` plus ``'title'`` and ``'date'`` when
             the record provides them, or ``None`` when CrossRef reports no
             result for the DOI
    """
    message = cr.works(filter={"doi": doi})['message']

    items = message.get('items')
    if message['total-results'] <= 0 or not items:
        return None

    item = items[0]
    entry = {'file_ref': file}

    # A record may carry no title, or an empty title list; leave the key out
    # then instead of failing with KeyError/IndexError.
    titles = item.get('title')
    if titles:
        entry['title'] = titles[0]

    # 'date-parts' is a list holding one [year, month, day] list.
    date_parts = item.get('created', {}).get('date-parts')
    if date_parts and date_parts[0]:
        entry['date'] = "-".join(str(part) for part in date_parts[0][:3])

    return entry
|
||||
|
||||
|
||||
# Expected directory structure is:
# [directory_arg]
#   - 10.1145, organisation DOI prefix (e.g. for ACM)
#       - 4547.4552.pdf, file name is the document DOI suffix
#       - 361604.361612.pdf
#       - ...
#   - another prefix
#       - more suffixes
def walk_files(dir, prefix):
    """Resolve every file below *dir* and store the outcome in ``results``.

    Sub-directories are walked recursively, with the sub-directory name used
    as the DOI prefix for the files inside it. For each file the DOI is
    "<prefix>/<file name without extension>". A DOI that is not cached yet is
    looked up through ``query_crossref``. A cached entry is reused, unless it
    carries a ``'timestamp'`` (i.e. it records a failed lookup) that is older
    than ``fifty_days``; such an entry is looked up again.

    :param dir: directory to scan
    :param prefix: DOI prefix for the files directly inside *dir*, or
                   ``None`` for the root directory, which is expected to
                   contain only prefix directories
    """
    for file in os.listdir(dir):
        full_file = os.path.join(dir, file)

        print("Process file: " + full_file)

        if os.path.isdir(full_file):
            walk_files(dir=full_file, prefix=file)
            continue

        # A file directly in the root has no prefix directory, so no DOI can
        # be built for it; skip it instead of failing on `None + "/"`.
        if prefix is None:
            print("Skipping file without DOI prefix directory: " + full_file)
            continue

        doi = prefix + "/" + os.path.splitext(file)[0]
        file_ref = 'links/' + prefix + "/" + file

        # Query the cache first
        if doi in results:
            print("Cache hit!")

            result = results[doi]

            # Only cache-miss entries carry a timestamp. They are retried once
            # they are older than fifty days (timestamp + 50 days lies in the
            # past), not while they are still fresh.
            expired = ('timestamp' in result
                       and (result['timestamp'] + fifty_days) < datetime.now().timestamp())
            if not expired:
                continue

            # Cache result is older than 50 days, query CrossRef again
            result = query_crossref(doi, file_ref)
        else:
            print("Cache miss!")

            # Check whether CrossRef has metadata for this DOI
            result = query_crossref(doi, file_ref)

            if result is None:
                print("Not found!")

        # Not found (again): create a fresh cache miss entry
        if result is None:
            result = {'timestamp': datetime.now().timestamp(), 'file_ref': file_ref}

        results[doi] = result
|
||||
|
||||
|
||||
def write_cache(file):
    """Persist the in-memory ``results`` cache to *file* as JSON.

    :param file: path of the cache file; it is created or overwritten
    """
    with open(file, 'w') as out:
        json.dump(results, out)
|
||||
|
||||
|
||||
def read_cache(file):
    """Load the cache stored in *file*.

    :param file: path of the JSON cache file
    :return: the decoded JSON content, or an empty dict when *file* is not an
             existing regular file
    """
    if not os.path.isfile(file):
        return {}

    with open(file, 'r') as source:
        return json.load(source)
|
||||
|
||||
|
||||
def write_html(file):
    """Render the ``results`` cache as an HTML table and write it to *file*.

    Each cache entry becomes one row holding its title, its date and a link
    to its ``'file_ref'``. Entries without a title or date get a placeholder.

    :param file: path of the HTML file; it is created or overwritten
    """
    # Local import so this function carries its own dependency.
    from html import escape

    with open(file, 'w') as html_file:
        html_file.write("<html><head></head><body><table><tr><th>Title</th><th>Year</th><th>Link</th></tr>")

        for entry in results.values():
            # Titles and dates come from CrossRef and paths from the file
            # system; escape them so characters such as '<', '&' or '"'
            # cannot break (or inject into) the generated markup.
            title = escape(str(entry['title'])) if 'title' in entry else "No title!"
            date = escape(str(entry['date'])) if 'date' in entry else "No date!"
            link = escape(str(entry['file_ref']), quote=True)

            html_file.write("<tr>")
            html_file.write("<td>" + title + "</td>")
            html_file.write("<td>" + date + "</td>")
            html_file.write("<td><a href=\"" + link + "\">click</a></td>")
            html_file.write("</tr>")

        # Close every element that was opened, including <html>.
        html_file.write("</table></body></html>")
|
||||
|
||||
|
||||
# Load the cache, resolve all files, then persist the cache and the HTML view.
results = read_cache(file=args.cachefile)
walk_files(dir=args.directory, prefix=None)
write_cache(file=args.cachefile)
write_html(file=args.htmlfile)

# The generated links start with 'links/', so expose the document directory
# under that name. A symlink left over from an earlier run is replaced;
# creating it unconditionally fails with FileExistsError on a second run.
if os.path.islink('links'):
    os.remove('links')
if not os.path.lexists('links'):
    os.symlink(args.directory, 'links')
else:
    print("'links' exists and is not a symlink, leaving it untouched")

# Serve the current working directory on port 8888.
Handler = http.server.SimpleHTTPRequestHandler
# Allow an immediate restart while the old socket is still in TIME_WAIT.
socketserver.TCPServer.allow_reuse_address = True
httpd = socketserver.TCPServer(("", 8888), Handler)
try:
    httpd.serve_forever()
finally:
    # Release the listening socket when the server stops (e.g. Ctrl+C).
    httpd.server_close()
|
||||
Reference in New Issue
Block a user