#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

"""
This script turns the contents of the IVOA document repository into
the ADS tagged format.

Warning: it will walk a major portion of the IVOA document repository,
which translates into ~100 requests fired without rate limitation.

Among the complications are:

(1) We're creating electronic document identifiers (see make_ivoadoc_id and
following)

(2) We're manipulating the author lists to ensure the editor(s) are in the
first position.

(3) As ADS would rather not have records they already have resubmitted,
we query it using a "new API" endpoint.

After all these complications, it might make sense to finally introduce
classes for representing records (rather than dictionaries, the keys of
which are defined through the namespace in the parse_landing_page
function...) and probably the whole collection, too (rather than a simple
list). MD might do this if there's another feature request...



Copyright 2014-2015, Markus Demleitner <msdemlei@ari.uni-heidelberg.de>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""

import argparse
import cgi
import itertools
import json
import os
import re
import sys
import traceback
import urlparse

import BeautifulSoup
import requests


CACHE_RESULTS = False

# When two documents were published on the same date by authors
# with the same initial, we need to reliably add a qualifier.
# This is a dict of landing page URLs to qualifiers. In the future,
# the document coordinator should try to avoid such situations,
# so hopefully the following enumeration is exhaustive.
BIBCODE_QUALIFIERS = {
    "http://www.ivoa.net/documents/cover/ConeSearch-20080222.html": "Q",
    "http://www.ivoa.net/documents/VOSpace/20091007/": "Q",
    "http://www.ivoa.net/documents/SLAP/20101209/": "Q",
}

# endpoint of the ADS "bigquery" API
ADS_ENDPOINT = "https://api.adsabs.harvard.edu/v1/search/bigquery?"


########################## Utilities

class Error(Exception):
    """Base class of exceptions raised by us.
    """

class ValidationError(Error):
    """is raised for documents that are in some way invalid.
    """

class ExternalError(Error):
    """is raised if some external service behaved unexpectedly.
    """

class Finished(Exception):
    """used by the abstract collector to abort item collection in case of
    malstructured documents.
    """
    def __init__(self, payload):
        self.payload = payload
        Exception.__init__(self, "Unexpected div")


def get_with_cache(url):
    cacheName = re.sub("[^\w]+", "", url)+".cache"
    if CACHE_RESULTS and os.path.exists(cacheName):
        doc = open(cacheName).read().decode("utf-8")
    else:
        doc = requests.get(url).text
        if CACHE_RESULTS:
            f = open(cacheName, "w")
            f.write(doc.encode("utf-8"))
            f.close()
    return doc


def get_enclosing_element(soup, tag, text):
    """returns the first match of tag that contains an element containing
    text.
    """
    for el in soup.findAll(tag):
        if text in el.text:
            return el


########################## Screen scraping landing pages

MONTH_NAMES = ["January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"]
DATE_RE = re.compile(r"(\d{1,2})\s*(%s)\s*(\d\d\d\d)"%
    "|".join(MONTH_NAMES))


def parse_subhead_date(s):
    """returns year, month, and day for the first date found in s,
    where dates look like those on IVOA document landing pages.
    """
    mat = DATE_RE.search(s)
    if not mat:
        raise Exception("No date visible in %s"%repr(s))
    return (int(mat.group(3)),
        MONTH_NAMES.index(mat.group(2))+1,
        int(mat.group(1)))


def format_abstract(el):
    """returns plain text from a BeautifulSoup element.

    This traverses the tree, stopping when it encounters the first div.
    Only very little markup is supported (all we have is ADS' abstract
    syntax).
    """
    accum = []

    if isinstance(el, BeautifulSoup.NavigableString):
        accum.append(el.string)

    elif el.name=="div":
        # this is probably bad document structure, in that this div
        # should not be a child of the abstract. Stop collecting, but
        # pass upstream what we've collected so far.
        raise Finished(" ".join(accum))

    elif el.name in ("ul", "ol"):
        # can't see a way to properly do ul in running text, so folding
        # it to ol.
        for index, child in enumerate(el.findAll("li", recursive=False)):
            accum.append(" (%s) %s "%(index+1, format_abstract(child)))

    else:
        if el.name=="p":
            accum.append("\n\n")
        for child in el:
            try:
                accum.append(format_abstract(child))
            except Finished, rest:
                raise Finished(" ".join(accum+[rest.payload]))

    return " ".join(accum)


def get_abstract_text(soup):
    """returns a guess for what the abstract within soup is.

    Unfortunately, the abstract isn't marked up well on IVOA landing
    pages. Hence, we just look for the headline and gobble up material
    until we reach a div after that.
    """
    abstract_head = get_enclosing_element(soup, "h2", "Abstract")
    el = abstract_head.nextSibling
    accum = []
    while getattr(el, "name", None)!="div":
        try:
            accum.append(format_abstract(el))
        except Finished, rest:
            # div found as abstract child, suspect malformed document.
            accum.append(rest.payload)
            break
        el = el.nextSibling
    return " ".join(accum)


def clean_field(s):
    """return s with normalised space and similar, ready for inclusion
    into ADS' tagged format.

    Don't do this to abstracts.
    """
    # Oh shucks, "Grid *and* Web Services" requires a special hack.
    return re.sub(",? and ", ", ",
        re.sub("\s+", " ", s)).replace("Grid, ", "Grid and ")


SHORT_NAME_EXCEPTIONS = {
    "VOT": "VOTable"
}

def guess_short_name(url_in_docrepo):
    """guesses the short name of a document based on its docrepo URL.

    Due to historically confusing practices, this is hard to do. Our
    heuristics: we throw out known parts of common URLs and take the
    segment with the most uppercase characters.

    >>> guess_short_name("http://www.ivoa.net/documents/SAMP/20120411/")
    'SAMP'
    >>> guess_short_name("www.ivoa.net/documents/cover/SAMP-20090421.html")
    'SAMP'
    >>> guess_short_name("http://www.ivoa.net/documents/cover/VOT-20040811.html")
    'VOTable'
    """
    # cut prefix
    local_path = re.sub(".*documents/", "", url_in_docrepo)
    # cut known junk
    unjunked = re.sub("index.html", "",
        re.sub("cover/", "", local_path))
    # score candidates according to the number of uppercase characters
    scored = list(sorted((len(re.sub("[^A-Z]+", "", s)), s)
        for s in re.split("[/-]", unjunked)))
    # fail if inconclusive
    if len(scored)>1 and scored[-1][0]==scored[-2][0]:
        raise Error("Cannot infer short name: %s"%url_in_docrepo)

    return SHORT_NAME_EXCEPTIONS.get(scored[-1][1], scored[-1][1])


def parse_landing_page(url, local_metadata):
    """returns a dictionary of document properties for a document taken
    from its landing page.
    """
    soup = BeautifulSoup.BeautifulSoup(get_with_cache(url))
    authors = clean_field(
        get_enclosing_element(soup, "dt", "Author(s):"
            ).findNextSibling("dd").getText(" "))
    editors = clean_field(get_enclosing_element(soup, "dt", "Editor(s):"
        ).findNextSibling("dd").getText(" "))
    tagline = soup.find("h2").text
    date = parse_subhead_date(tagline)
    abstract = get_abstract_text(soup).replace("\r", "")

    title = clean_field(soup.find("h1").getText(" "))
    journal = tagline

    pdf_enclosure = get_enclosing_element(soup, "a", "PDF")
    if pdf_enclosure:
        pdf = urlparse.urljoin(url, pdf_enclosure.get("href"))

    try:
        arXiv_id = local_metadata.get_arXiv_id_for_URL(url)
    except KeyError:
        # That's ok for notes, and checked separately for RECs
        pass

    del soup
    return locals()


########################## Screen scraping the index page

def iter_links_from_table(src_table, rec_class):
    """iterates over links to finished documents in src_table.

    src_table is a BeautifulSoup node for one of our documents-in-progress
    tables (realistically, recommendations or endorsed notes).

    rec_class is a CSS class name which marks links to finished standards
    in the respective table (in reality, en or rec).

    The function yields anchor elements.
    """
    for links in src_table.findAll("td", {"class": "versionold"}):
        for anchor in links.findAll("a", {"class": rec_class}):
            yield anchor


def iter_REC_URLs(doc_index, repo_url):
    """iterates over URLs to RECs (different versions are different documents).

    doc_index is a BeautifulSoup of the IVOA documents repo. Each URL
    in a class=rec anchor will be returned exactly once. Document
    order is maintained.
    """
    seen_stds = set()
    rec_table = get_enclosing_element(doc_index, "h3",
        "Technical Specifications").findNextSibling("table")
    en_table = get_enclosing_element(doc_index, "h3",
        "Endorsed Note").findNextSibling("table")

    for anchor in itertools.chain(
            iter_links_from_table(rec_table, "rec"),
            iter_links_from_table(en_table, "en")):
        # we'll fix URLs to some degree here; in particular,
        # uppercase Documents, which was fairly common in the old days,
        # is lowercased.
        url = urlparse.urljoin(repo_url, anchor.get("href"
            ).replace("Documents", "documents"))

        if url in seen_stds:
            continue
        seen_stds.add(url)
        yield url


def iter_Notes_URLs():
    """iterates over URLs of published notes.

    Right now, most notes are not pushed to ADS. Instead, the Exec
    lists the ones it wants published, and the document coordinator
    manually adds the URLs to published_notes.txt.
    """
    with open("published_notes.txt") as f:
        for ln in f:
            if ln.strip() and not ln.startswith("#"):
                yield ln.strip()


########################## record generation logic

class Document(dict):
    """Metadata of an IVOA document.

    These are constructed with a dictionary of items found; this
    includes authors*, editors*, date*, abstract*, title*, type*
    (spec/rept), pdf (its URL), url* (of the landing page), journal*,
    arXiv_id (mandatory for RECs), but additional arbitrary keys are allowed.
    Items with stars are mandatory.

    You'll usually use the from_URL class function to construct one
    from an IVOA document landing page.

    >>> Document(TEST_DATA["ru"])
    Traceback (most recent call last):
    ValidationError: Document at http://foo/bar: Missing key(s) date, editors
    >>> d = Document(TEST_DATA["r1"])
    >>> d["authors"]
    'Greg Ju, Fred Gnu Test, Wang Chu'
    >>> d.bibcode
    '2014ivoa.spec.0307J'
    >>> d.as_ADS_record()[:59]
    '%R 2014ivoa.spec.0307J\\n%D 3/2014\\n%I ELECTR: http://foo/bar;'
    >>> d2 = Document.from_URL("http://www.ivoa.net/documents/SAMP/20120411"
    ...     "/index.html", TEST_DATA["lm"])
    >>> d2["authors"]
    u'T. Boch, M. Fitzpatrick, M. Taylor, A. Allan, J. Fay, L. Paioro, J. Taylor, D. Tody'
    >>> d2.bibcode
    u'2012ivoa.spec.0411B'
    >>> Document(TEST_DATA["rr"])
    Traceback (most recent call last):
    Error: RECs must have arXiv_id (add to arXiv_ids.txt); failing on document at http://foo/failrec
    >>> Document(TEST_DATA["rme"])["authors"]
    'First Editor, Second Editor, Some Guy, Guy Rixon'
    """

    mandatory_keys = frozenset(
        ["url", "authors", "editors", "date", "abstract", "title", "journal"])
    key_to_ads = [
        ("authors", "A"),
        ("editors", "e"),
        ("title", "T"),
        ("source", "G"),
        ("journal", "J"),
        ("abstract", "B"),
    ]

    def __init__(self, vals):
        dict.__init__(self, vals)
        self["source"] = "IVOA"
        self.validate()
        self._perform_editor_hack()
        self._infer_type()
#        if self["type"]=="spec":
#            if not self.get("arXiv_id"):
#                raise Error("RECs must have arXiv_id (add to arXiv_ids.txt);"
#                    " failing on document at %s"%(self["url"]))

    @classmethod
    def from_URL(cls, url, local_metadata):
        """returns a new Document made from the IVOA landing page at url.
        """
        return cls(parse_landing_page(url, local_metadata))

    def validate(self):
        """raises a ValidationError if one or more of the mandatory_keys
        are missing.
        """
        missing_keys = self.mandatory_keys-set(self)
        if missing_keys:
            raise ValidationError("Document at %s: Missing key(s) %s"%(
                self.get("url", "<unknown origin>"), ", ".join(missing_keys)))

    def _infer_type(self):
        """decides whether this document is a spec (Recommendation) or
        rept (Note).

        We currently do this according to the journal field (specs have
        "Recommendation" or "Endorsed Note" in there).
        """
        if ("Recommendation" in self["journal"]
                or "Endorsed Note" in self["journal"]):
            self["type"] = "spec"
        else:
            self["type"] = "rept"

    def _perform_editor_hack(self):
        """fudges the authors list so that the editor(s) come first.

        This was the express wish of Francoise Genova to provide sufficient
        credit to the editors who, typically, did most of the work that went
        into a document.

        This method is called by the constructor; it's designed to be
        idempotent.
        """
        if not self["editors"].strip():
            return

        eds = set(s.strip() for s in self["editors"].split(","))
        auths = [s.strip() for s in self["authors"].split(",")]

        # sanity check: if an item of eds or auths contains neither a
        # blank nor a dot, we probably have a bad author format
        name_pattern = re.compile(r"[\. ]")
        if ([item for item in eds if not name_pattern.search(item)]
                or [item for item in auths if not name_pattern.search(item)]):
            raise Exception("Bad author format suspected in %s/%s"%(
                eds, auths))

        non_editors = ", ".join(item for item in auths if item not in eds)
        if non_editors:
            self["authors"] = "%s, %s"%(self["editors"], non_editors)
        else:
            self["authors"] = self["editors"]

    def get_first_author_surname(self):
        """returns the surname for the first author.

        This is pure heuristics -- we need it for bibcode generation, and
        hence we should keep this in sync with what ADS wants.
        """
        # current heuristics: the last "word" in front of the first
        # comma. This will fail for many interesting cases, but
        # IVOA contributors appear to have tame names for now.
        return self["authors"].split(",")[0].split()[-1]

    @property
    def bibcode(self):
        """returns the bibcode for this record.
        """
        year, month, day = self["date"]
        return "%sivoa.%s%s%02d%02d%s"%(
            year, self["type"],
            BIBCODE_QUALIFIERS.get(self["url"], "."),
            month, day,
            self.get_first_author_surname()[0])

    def as_ADS_record(self):
        """returns this document in UTF-8 encoded ADS tagged format.
        """
        parts = ["%%R %s"%self.bibcode]

        year, month, day = self["date"]
        parts.append("%%D %s/%s"%(month, year))

        links = "%%I ELECTR: %s"%self["url"]
        if "pdf" in self:
            links += ";\nPDF: %s"%self["pdf"]
        if "ivoadoc-id" in self:
            links += ";\nEPRINT: %s"%self["ivoadoc-id"]
        if "arXiv_id" in self:
            links += ";\nARXIV: %s"%self["arXiv_id"]
        parts.append(links)

        for our_key, ads_key in self.key_to_ads:
            if our_key in self:
                parts.append("%%%s %s"%(ads_key, self[our_key]))

        return "\n".join(parts).encode("utf-8")


class DocumentCollection(object):
    """A collection of IVOA document metadata.

    This also contains logic that needs to see the entire collection.

    It is constructed with a sequence of Document instances; you
    will usually use the from_repo_URL class method which takes the
    URL of the IVOA's document collection.

    These things are conceptually immutable (i.e., you're not supposed
    to change self.docs).

    The main interface to this is iteration -- you'll get all the
    documents in temporal order.

    >>> dc = DocumentCollection(
    ...     Document(TEST_DATA[k]) for k in "r1 r2 r3".split())
    >>> dc.docs[0].bibcode
    '2014ivoa.spec.0307J'
    """
    def __init__(self, docs):
        self.docs = list(docs)
        self._sort_recs()
        self._create_identifiers()
        self.validate()

    @classmethod
    def from_repo_URL(cls, root_url, local_metadata):
        """returns a DocumentCollection ready for export, constructed
        from the index at root_url.
        """
        doc_index = BeautifulSoup.BeautifulSoup(
            requests.get(root_url).text)
        docs = []

        for url in itertools.chain(
                iter_REC_URLs(doc_index, root_url),
                iter_Notes_URLs()):
            try:
                docs.append(
                    Document.from_URL(urlparse.urljoin(root_url, url),
                        local_metadata))
            except KeyboardInterrupt:
                raise
            except:
                sys.stderr.write("\nIn document %s:\n"%url)
                traceback.print_exc()
        return cls(docs)

    def __iter__(self):
        return iter(self.docs)

    def validate(self):
        """runs some simple tests to avoid certain undesirable situations.

        Problems will lead to a validation error being raised.
        """
        docs_per_bibcode = {}
        for doc in self:
            docs_per_bibcode.setdefault(doc.bibcode, []).append(doc)
        dupes = [item for item in docs_per_bibcode.iteritems()
            if len(item[1])>1]
        if dupes:
            raise ValidationError("The following documents generated"
                " clashing bibcodes: %s. Fix by adding one of them to"
                " BIBCODE_QUALIFIERS in the source."%(
                    " AND ALSO\n".join(
                        " and ".join(c["url"] for c in clashing[1])
                        for clashing in dupes)))

    def _make_ivoadoc_id(self, rec, index):
        """returns, for a rec as returned by parse_landing_page
        and the document index within the publication month,
        the document's IVOA document id.

        The IVOA document id has the form
        ivoa:<t>.<year>.<month>.<count>. count is a running
        number per month, where documents are sorted first
        by date, then by first author last name, and finally
        by title. <t> is r for a REC-type thing, n for a
        NOTE-like thing.

        This is a helper for _create_identifiers.
        """
        return "ivoa:%s.%04d.%02d.%02d"%(
            "r" if rec["type"]=="spec" else "n",
            rec["date"][0],
            rec["date"][1],
            index)

    def _sort_recs(self):
        """sorts our records as required for IVOA identifier generation.

        That is, sorted by date, authors, and titles, in that order.
        This is called by the constructor.
        """
        self.docs.sort(key=lambda rec: rec["date"]+(
            rec.get_first_author_surname(), rec["title"]))

    def _get_month_partition(self):
        """returns a dictionary mapping (year, month) to the documents
        published in that month.

        This is a helper for _create_identifiers.
        """
        by_month = {}
        for rec in self.docs:
            year, month, day = rec["date"]
            by_month.setdefault((year, month), []).append(rec)
        return by_month

    def _create_identifiers(self):
        """adds ivoadoc-id keys to every record in self.

        See _make_ivoadoc_id for what this is.

        This is called by the constructor.
        """
        for (year, month), recs in self._get_month_partition().iteritems():
            for index, rec in enumerate(d for d in recs if d["type"]=="spec"):
                rec["ivoadoc-id"] = self._make_ivoadoc_id(rec, index)
            for index, rec in enumerate(d for d in recs if d["type"]=="rept"):
                rec["ivoadoc-id"] = self._make_ivoadoc_id(rec, index)


########################## local metadata injection

class LocalMetadata(object):
    """A container for parsed metadata kept in the SVN repo.

    Currently, that's a mapping from document short names to arXiv ids, kept in
    arXiv_map. By Exec decree, this is only available for IVOA RECs.
    """
    def __init__(self):
        self._load_arXiv_map()

    def _load_arXiv_map(self):
        self.arXiv_map = {}
        with open("arXiv_ids.txt") as f:
            for ln_index, ln in enumerate(f):
                try:
                    if ln.strip():
                        access_URL, arXiv_id = ln.split()
                        self.arXiv_map[access_URL.strip()] = arXiv_id.strip()
                except ValueError:
                    sys.exit("arXiv_ids.txt, line %s: entry not in <local><white><arxiv>"
                        " format."%(ln_index+1))

    def get_arXiv_id_for_URL(self, url):
        """returns the arXiv id based on a URL into the document repository.

        This involves guessing the short name, which may fail for weirdly
        formed docrepo URLs.

        If the lookup itself fails, a KeyError with the original url is
        raised.
        """
        short_name = guess_short_name(url)
        if short_name in self.arXiv_map:
            return self.arXiv_map[short_name]
        raise KeyError(url)


########################## ADS interface

def filter_unpublished_bibcodes(bibcodes, auth):
    """iterates over the bibcodes from bibcodes that are not yet known to ADS.
    """
    params = {
        'q': '*:*',
        'rows': 1000,
        'wt': 'json',
        'fq': '{!bitset}',
        'fl': 'bibcode'}
    payload = "bibcode\n"+"\n".join(bibcodes)

    req = requests.post(ADS_ENDPOINT,
        params=params,
        headers={'Authorization': 'Bearer %s'%auth},
        data=payload)
    response = json.loads(req.text)

    if response["responseHeader"]["status"]!=0:
        raise ExternalError("ADS API returned error: %s"%repr(response))

    known_bibcodes = set([r["bibcode"] for r in response["response"]["docs"]])
    for bibcode in bibcodes:
        if bibcode not in known_bibcodes:
            yield bibcode


########################## command line interface

def _test():
    """runs the embedded doctests.
    """
    import doctest, harvest
    harvest.TEST_DATA = {
        "r1": {"url": "http://foo/bar", "title": "Test doc",
            "authors": "Fred Gnu Test, Wang Chu", "editors": "Greg Ju",
            "date": (2014, 3, 7), "abstract": "N/A", "pdf": "uh",
            "journal": "IVOA Recommendation", "arXiv_id": "a-p/1"},
        "r2": {"url": "http://foo/baz", "title": "More Testing",
            "authors": u"René Descartes", "editors": "J.C. Maxwell",
            "date": (2014, 3, 7), "abstract": "N/A",
            "journal": "IVOA Recommendation", "arXiv_id": "a-p/2"},
        "r3": {"url": "http://foo/quux", "title": "Still more",
            "authors": "Leonhard Euler, Georg Cantor",
            "editors": "Frederic Chopin",
            "date": (2014, 5, 7), "abstract": "N/A",
            "journal": "IVOA Note"},
        "ru": {"url": "http://foo/bar", "title": "Test doc",
            "journal": "Broken Mess", "abstract": "", "authors": "X"},
        "rr": {"url": "http://foo/failrec", "title": "Test REC",
            "authors": "Fred Gnu Test, Wang Chu", "editors": "Greg Ju",
            "date": (2014, 3, 7), "abstract": "N/A", "pdf": "uh",
            "journal": "IVOA Recommendation"},
        "rme": {"url": "http://foo/twoeditors", "title": "I have two editors",
            "authors": "Second Editor, Some Guy, Guy Rixon, First Editor",
            "editors": "First Editor, Second Editor",
            "date": (2014, 3, 20), "abstract": "N/A",
            "journal": "IVOA Note"},
        "lm": LocalMetadata(),
    }
    doctest.testmod(harvest)


def parse_command_line():
    parser = argparse.ArgumentParser(
        description="Generate ADS records from the IVOA document repo.")
    parser.add_argument("-r", "--repo-url",
        action="store", dest="repo_url",
        help="Use URL as the document repository's URL",
        metavar="URL", default="http://www.ivoa.net/documents/")
    parser.add_argument("-t", "--test-only",
        action="store_true", dest="run_tests",
        help="Only run doctests, then exit (requires network).")
    parser.add_argument("-C", "--use-cache",
        action="store_true", dest="cache_web",
        help="Use cached copies of things obtained from the net"
            " (or create these caches).")
    parser.add_argument("-a", "--ads-token",
        action="store", type=str, dest="ads_token",
        help="ADS access token to filter out records already in ADS.",
        default=None)
    parser.add_argument("-s", "--single-doc",
        action="store", dest="doc_url",
        help="Only translate the document with landing page URL (only for"
            " testing/debugging; bibcodes may be wrong).",
        metavar="URL", default=None)
    return parser.parse_args()


def main():
    global CACHE_RESULTS
    args = parse_command_line()
    if args.cache_web or args.run_tests:
        CACHE_RESULTS = True

    if args.run_tests:
        _test()
        return

    local_metadata = LocalMetadata()
    if args.doc_url:
        dc = DocumentCollection(
            [Document.from_URL(args.doc_url, local_metadata)])
    else:
        dc = DocumentCollection.from_repo_URL(
            args.repo_url, local_metadata)

    limit_to = None
    if args.ads_token:
        limit_to = set(filter_unpublished_bibcodes(
            [doc.bibcode for doc in dc], args.ads_token))

    for rec in dc:
        if limit_to is not None:
            if rec.bibcode not in limit_to:
                continue

        print rec.as_ADS_record()
        print ""


if __name__=="__main__":
    try:
        main()
    except ValidationError, msg:
        sys.stderr.write(str(msg)+"\n")
        sys.stderr.write(
            "\nDocument repository invalid, not generating records.\n")