import os
import argparse
import json
from datetime import datetime
import http.server
import socketserver

from habanero import Crossref

parser = argparse.ArgumentParser()
parser.add_argument("directory", help="directory which contains the files to resolve",
                    type=str, default=".")
parser.add_argument("cachefile", help="path to the cache file containing already resolved DOIs",
                    type=str, default="cachefile.doir", nargs='?')
parser.add_argument("htmlfile", help="path to the HTML result file",
                    type=str, default="index.html", nargs='?')
args = parser.parse_args()

cr = Crossref()
results = {}

# Cache-miss entries are retried once they are older than fifty days (in seconds).
fifty_days = 60 * 60 * 24 * 50


def query_crossref(doi, file):
    # Ask CrossRef for metadata on this DOI; returns None if it is unknown.
    result = cr.works(filter={"doi": doi})
    if result['message']['total-results'] > 0:
        item = result['message']['items'][0]
        title = item['title'][0]
        date_parts = item['created']['date-parts'][0]
        date = str(date_parts[0]) + "-" + str(date_parts[1]) + "-" + str(date_parts[2])
        return {'title': title, 'date': date, 'file_ref': file}
    return None


# Expected directory structure is:
# [directory_arg]
#   - 10.1145             <- organisation DOI prefix (e.g. for ACM)
#     - 4547.4552.pdf     <- file name is the document DOI suffix
#     - 361604.361612.pdf
#     - ...
#   - another prefix
#     - more suffixes
def walk_files(dir, prefix):
    files = os.listdir(dir)
    for file in files:
        full_file = os.path.join(dir, file)
        print("Process file: " + full_file)
        if os.path.isdir(full_file):
            walk_files(dir=full_file, prefix=file)
        elif prefix is None:
            # Files directly below the top-level directory carry no DOI prefix.
            print("Skipping file without a DOI prefix: " + full_file)
        else:
            doi_suffix = os.path.splitext(file)[0]
            doi = prefix + "/" + doi_suffix
            file_ref = 'links/' + prefix + "/" + file
            # Query the cache first
            if doi in results:
                # Cache hit
                print("Cache hit!")
                result = results[doi]
                # Entries with a timestamp are earlier misses; retry them once
                # the cached result is older than fifty days.
                has_timestamp = 'timestamp' in result
                if has_timestamp and (result['timestamp'] + fifty_days) < datetime.now().timestamp():
                    # Query CrossRef again
                    result = query_crossref(doi, file_ref)
                    # Not found again, create a new cache-miss entry
                    if result is None:
                        result = {'timestamp': datetime.now().timestamp(), 'file_ref': file_ref}
            else:
                # Cache miss
                print("Cache miss!")
                # Check whether CrossRef has metadata for this DOI
                result = query_crossref(doi, file_ref)
                # If not, create a cache-miss entry
                if result is None:
                    print("Not found!")
                    result = {'timestamp': datetime.now().timestamp(), 'file_ref': file_ref}
            results[doi] = result


def write_cache(file):
    with open(file, 'w') as cache_file:
        cache_file.write(json.dumps(results))


def read_cache(file):
    if os.path.isfile(file):
        with open(file, 'r') as cache_file:
            return json.loads(cache_file.read())
    else:
        return {}


def write_html(file):
    with open(file, 'w') as html_file:
        html_file.write("<html><body><table>")
        html_file.write("<tr><th>Title</th><th>Year</th><th>Link</th></tr>")
        for doi in results:
            result = results[doi]
            html_file.write("<tr>")
            if 'title' in result:
                html_file.write("<td>" + result['title'] + "</td>")
            else:
                html_file.write("<td>No title!</td>")
            if 'date' in result:
                html_file.write("<td>" + result['date'] + "</td>")
            else:
                html_file.write("<td>No date!</td>")
            html_file.write('<td><a href="' + result['file_ref'] + '">click</a></td>')
            html_file.write("</tr>")
        html_file.write("</table></body></html>")
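# For reference, the cache file holds one JSON object keyed by DOI: resolved
# entries store 'title', 'date', and 'file_ref'; unresolved entries store the
# 'timestamp' of the failed lookup instead. The values below are illustrative
# only (the second DOI is made up):
#   {"10.1145/4547.4552": {"title": "...", "date": "1990-1-1",
#                          "file_ref": "links/10.1145/4547.4552.pdf"},
#    "10.1145/0.0": {"timestamp": 1700000000.0,
#                    "file_ref": "links/10.1145/0.0.pdf"}}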
results = read_cache(file=args.cachefile)
walk_files(dir=args.directory, prefix=None)
write_cache(file=args.cachefile)
write_html(file=args.htmlfile)

# Expose the scanned directory as 'links/' so the generated links resolve;
# skip the symlink if it is already in place from an earlier run.
if not os.path.islink('links'):
    os.symlink(args.directory, 'links')

# Serve the generated page and the linked files on port 8888.
Handler = http.server.SimpleHTTPRequestHandler
httpd = socketserver.TCPServer(("", 8888), Handler)
httpd.serve_forever()