Initial commit
This commit is contained in:
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
cachefile.doir
|
||||||
|
index.html
|
||||||
|
links/
|
||||||
|
.idea/
|
||||||
12
Pipfile
Normal file
12
Pipfile
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
[[source]]
|
||||||
|
name = "pypi"
|
||||||
|
url = "https://pypi.org/simple"
|
||||||
|
verify_ssl = true
|
||||||
|
|
||||||
|
[dev-packages]
|
||||||
|
|
||||||
|
[packages]
|
||||||
|
habanero = "*"
|
||||||
|
|
||||||
|
[requires]
|
||||||
|
python_version = "3.8"
|
||||||
71
Pipfile.lock
generated
Normal file
71
Pipfile.lock
generated
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
{
|
||||||
|
"_meta": {
|
||||||
|
"hash": {
|
||||||
|
"sha256": "703754edf167e2bedf149502d92a2b030f121676b4ee7dcf07db18d247316a26"
|
||||||
|
},
|
||||||
|
"pipfile-spec": 6,
|
||||||
|
"requires": {
|
||||||
|
"python_version": "3.8"
|
||||||
|
},
|
||||||
|
"sources": [
|
||||||
|
{
|
||||||
|
"name": "pypi",
|
||||||
|
"url": "https://pypi.org/simple",
|
||||||
|
"verify_ssl": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"default": {
|
||||||
|
"certifi": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3",
|
||||||
|
"sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f"
|
||||||
|
],
|
||||||
|
"version": "==2019.11.28"
|
||||||
|
},
|
||||||
|
"chardet": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
|
||||||
|
"sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
|
||||||
|
],
|
||||||
|
"version": "==3.0.4"
|
||||||
|
},
|
||||||
|
"habanero": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:b8494c0c600a5973e91ca6dc007781158a2799431d9c61888e095a2091a435df",
|
||||||
|
"sha256:d6b545384285b98b1577b0f88fd0da4cd59d550134776dd08f13188e77f3af88"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==0.7.2"
|
||||||
|
},
|
||||||
|
"idna": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb",
|
||||||
|
"sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa"
|
||||||
|
],
|
||||||
|
"version": "==2.9"
|
||||||
|
},
|
||||||
|
"requests": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:43999036bfa82904b6af1d99e4882b560e5e2c68e5c4b0aa03b655f3d7d73fee",
|
||||||
|
"sha256:b3f43d496c6daba4493e7c431722aeb7dbc6288f52a6e04e7b6023b0247817e6"
|
||||||
|
],
|
||||||
|
"version": "==2.23.0"
|
||||||
|
},
|
||||||
|
"tqdm": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:00339634a22c10a7a22476ee946bbde2dbe48d042ded784e4d88e0236eca5d81",
|
||||||
|
"sha256:ea9e3fd6bd9a37e8783d75bfc4c1faf3c6813da6bd1c3e776488b41ec683af94"
|
||||||
|
],
|
||||||
|
"version": "==4.45.0"
|
||||||
|
},
|
||||||
|
"urllib3": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc",
|
||||||
|
"sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc"
|
||||||
|
],
|
||||||
|
"version": "==1.25.8"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"develop": {}
|
||||||
|
}
|
||||||
131
doir.py
Executable file
131
doir.py
Executable file
@@ -0,0 +1,131 @@
|
|||||||
|
import os.path
|
||||||
|
import argparse
|
||||||
|
from habanero import Crossref
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
import http.server
|
||||||
|
import socketserver
|
||||||
|
|
||||||
|
# Command-line interface.  All three arguments are positional; cachefile and
# htmlfile were already optional (nargs='?').  NOTE: argparse ignores
# `default` on a *required* positional, so the original `default="."` on
# `directory` never took effect — `nargs='?'` makes it work as documented
# while remaining backward compatible for callers that do pass a directory.
parser = argparse.ArgumentParser()
parser.add_argument("directory", help="directory which contains the files to resolve", type=str, default=".",
                    nargs='?')
parser.add_argument("cachefile", help="path to the cache file containing already resolved DOIs", type=str,
                    default="cachefile.doir", nargs='?')
parser.add_argument("htmlfile", help="path to the html result file", type=str, default="index.html", nargs='?')
args = parser.parse_args()

# CrossRef API client (habanero).
cr = Crossref()
# In-memory DOI cache: doi -> {'title', 'date', 'file_ref'} for resolved DOIs,
# or {'timestamp', 'file_ref'} for a recorded cache miss.
results = {}
# Cache-miss entries older than this many seconds get re-queried.
# Fifty days
fifty_days = 60 * 60 * 24 * 50
|
||||||
|
|
||||||
|
|
||||||
|
def query_crossref(doi, file):
    """Look up *doi* on CrossRef via the habanero client.

    Returns a dict with 'title', 'date' ("Y-M-D" from the 'created'
    date-parts) and 'file_ref' (set to *file*) when CrossRef knows the DOI,
    or None when it does not — callers rely on the None to record a
    cache-miss entry.
    """
    result = cr.works(filter={"doi": doi})

    if result['message']['total-results'] > 0:
        item = result['message']['items'][0]
        title = item['title'][0]
        # 'date-parts' is a nested list like [[year, month, day]]; some
        # records carry fewer parts (e.g. year only), so join whatever is
        # present instead of hard-indexing three elements.
        date_parts = item['created']['date-parts'][0]
        date = "-".join(str(part) for part in date_parts)

        return {'title': title, 'date': date, 'file_ref': file}

    # Explicit "not found" instead of falling off the end of the function.
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def walk_files(dir, prefix):
    """Recursively resolve DOIs for all files below *dir*, filling `results`.

    Expected directory structure is:
    [directory_arg]
      - 10.1145          (organisation DOI prefix, e.g. for ACM)
        - 4547.4552.pdf  (file name is the document DOI suffix)
        - 361604.361612.pdf
        - ...
      - another prefix
        - more suffixes

    *prefix* is the DOI prefix of the current directory level; it is None
    for the top-level call, so plain files directly inside the top-level
    directory are not supported (the DOI concatenation would fail).
    Results are stored in the module-level `results` dict keyed by the full
    DOI "prefix/suffix".
    """
    files = os.listdir(dir)

    for file in files:
        full_file = os.path.join(dir, file)

        print("Process file: " + full_file)

        if os.path.isdir(full_file):
            # Descend: the directory name becomes the DOI prefix.
            walk_files(dir=full_file, prefix=file)
        else:
            doi_suffix = os.path.splitext(file)[0]
            doi = prefix + "/" + doi_suffix
            # Query the cache first
            cached = doi in results

            # Cache hit
            if cached:
                print("Cache hit!")

                result = results[doi]
                # Only cache-miss entries carry a 'timestamp'; resolved
                # entries are kept forever.
                has_timestamp = 'timestamp' in result

                # BUGFIX: the comparison was inverted ('>' instead of '<'),
                # which re-queried CrossRef while the miss entry was still
                # FRESH and never re-queried once it had actually expired.
                if has_timestamp and (result['timestamp'] + fifty_days) < datetime.now().timestamp():
                    # Cache result is older than 50 days
                    # Query CrossRef again
                    result = query_crossref(doi, 'links/' + prefix + "/" + file)

                    # Not found again, create a new cache miss entry
                    if result is None:
                        result = {'timestamp': datetime.now().timestamp(), 'file_ref': 'links/' + prefix + "/" + file}
            # Cache miss
            else:
                print("Cache miss!")

                # Check whether CrossRef has metadata for this DOI
                result = query_crossref(doi, 'links/' + prefix + "/" + file)

                # If not, create a cache miss entry
                if result is None:
                    print("Not found!")

                    result = {'timestamp': datetime.now().timestamp(), 'file_ref': 'links/' + prefix + "/" + file}

            results[doi] = result
|
||||||
|
|
||||||
|
|
||||||
|
def write_cache(file):
    """Persist the module-level `results` dict to *file* as JSON."""
    with open(file, 'w') as fh:
        json.dump(results, fh)
|
||||||
|
|
||||||
|
|
||||||
|
def read_cache(file):
    """Return the cached DOI results stored in *file*, or {} when the
    cache file does not exist yet (e.g. on the first run)."""
    if not os.path.isfile(file):
        return {}
    with open(file, 'r') as fh:
        return json.load(fh)
|
||||||
|
|
||||||
|
|
||||||
|
def write_html(file):
    """Render the module-level `results` dict as an HTML table into *file*.

    One row per DOI: title, date, and a link to the file under 'links/'.
    Titles and dates come from CrossRef and may contain '<', '&', quotes
    etc., so they are escaped before being written.
    """
    import html  # local import: stdlib escaping for untrusted metadata

    with open(file, 'w') as html_file:
        html_file.write("<html><head></head><body><table><tr><th>Title</th><th>Year</th><th>Link</th></tr>")
        for entry in results.values():
            html_file.write("<tr>")
            if 'title' in entry:
                # BUGFIX: escape CrossRef-supplied text; a title containing
                # '<' or '&' previously produced broken/injected markup.
                html_file.write("<td>" + html.escape(entry['title']) + "</td>")
            else:
                html_file.write("<td>No title!</td>")

            if 'date' in entry:
                html_file.write("<td>" + html.escape(entry['date']) + "</td>")
            else:
                html_file.write("<td>No date!</td>")

            # quote=True because file_ref lands inside an attribute value.
            html_file.write("<td><a href=\"" + html.escape(entry['file_ref'], quote=True) + "\">click</a></td>")
            html_file.write("</tr>")

        # BUGFIX: the document was never closed with </html>.
        html_file.write("</table></body></html>")
|
||||||
|
|
||||||
|
|
||||||
|
# Main flow: load the cache, resolve every file under the target directory,
# persist the cache, render the HTML index, then serve it on port 8888.
results = read_cache(file=args.cachefile)
walk_files(dir=args.directory, prefix=None)
write_cache(file=args.cachefile)
write_html(file=args.htmlfile)

# The HTML links point at 'links/...', so expose the scanned directory
# under that name.  BUGFIX: os.symlink raises FileExistsError on every run
# after the first; remove a stale link before recreating it.  (If 'links'
# exists but is a real directory, we still fail loudly rather than delete
# user data.)
if os.path.islink('links'):
    os.remove('links')
os.symlink(args.directory, 'links')

Handler = http.server.SimpleHTTPRequestHandler
# Allow quick restarts without "Address already in use" from the kernel's
# TIME_WAIT on the listening socket.
socketserver.TCPServer.allow_reuse_address = True
httpd = socketserver.TCPServer(("", 8888), Handler)
httpd.serve_forever()
|
||||||
Reference in New Issue
Block a user