/[volute]/trunk/projects/WebAssets/tools/docrepoToADS/harvest.py


Revision 5328
Thu Mar 14 13:22:27 2019 UTC by msdemlei
File MIME type: text/x-python
File size: 24260 byte(s)
docrepoToADS: migrating to requests to mitigate the impact of varying encodings.


#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

"""
This script turns the contents of the IVOA document repository into
the ADS tagged format.

Warning: it will walk a major portion of the IVOA document repository,
which translates into ~100 requests fired without rate limitation.

Among the complications are:

(1) We're creating electronic document identifiers (see make_ivoadoc_id and
following).

(2) We're manipulating the author lists to ensure the editor(s) are in the
first position.

(3) As ADS would rather not have records it already holds resubmitted,
we query it using a "new API" endpoint.

After all these complications, it might make sense to finally introduce
classes for representing records (rather than dictionaries, the keys of
which are defined through the namespace in the parse_landing_page
function...) and probably the whole collection, too (rather than a simple
list).  MD might do this if there's another feature request...


Copyright 2014-2015, Markus Demleitner <msdemlei@ari.uni-heidelberg.de>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""

import argparse
import cgi
import itertools
import json
import os
import re
import sys
import traceback
import urlparse

import BeautifulSoup
import requests


CACHE_RESULTS = False

# When two documents were published on the same date by authors
# with the same initial, we need to reliably add a qualifier.
# This is a dict of landing page URLs to qualifiers.  In the future,
# the document coordinator should try to avoid such situations,
# so hopefully the following enumeration is exhaustive.
BIBCODE_QUALIFIERS = {
    "http://www.ivoa.net/documents/cover/ConeSearch-20080222.html": "Q",
    "http://www.ivoa.net/documents/VOSpace/20091007/": "Q",
    "http://www.ivoa.net/documents/SLAP/20101209/": "Q",
}
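
# Illustrative note (added; not in the original source): the qualifier
# replaces the "." between document type and date in the bibcode (see
# Document.bibcode below), so the ConeSearch entry above yields
# something like 2008ivoa.specQ0222X instead of 2008ivoa.spec.0222X,
# where X is the first author's surname initial.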

# endpoint of the ADS "bigquery" API
ADS_ENDPOINT = "https://api.adsabs.harvard.edu/v1/search/bigquery?"


########################## Utilities

class Error(Exception):
    """Base class of exceptions raised by us.
    """

class ValidationError(Error):
    """is raised for documents that are in some way invalid.
    """

class ExternalError(Error):
    """is raised if some external service behaved unexpectedly.
    """

class Finished(Exception):
    """used by the abstract collector to abort item collection in case of
    malformed documents.
    """
    def __init__(self, payload):
        self.payload = payload
        Exception.__init__(self, "Unexpected div")


def get_with_cache(url):
    """returns the text of the document at url, using a local file cache
    if CACHE_RESULTS is set.
    """
    cacheName = re.sub(r"[^\w]+", "", url)+".cache"
    if CACHE_RESULTS and os.path.exists(cacheName):
        doc = open(cacheName).read().decode("utf-8")
    else:
        doc = requests.get(url).text
        if CACHE_RESULTS:
            f = open(cacheName, "w")
            f.write(doc.encode("utf-8"))
            f.close()
    return doc


def get_enclosing_element(soup, tag, text):
    """returns the first match of tag that contains an element containing
    text.
    """
    for el in soup.findAll(tag):
        if text in el.text:
            return el


########################## Screen scraping landing pages

MONTH_NAMES = ["January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"]
DATE_RE = re.compile(r"(\d{1,2})\s*(%s)\s*(\d\d\d\d)"%
    "|".join(MONTH_NAMES))


def parse_subhead_date(s):
    """returns year, month, and day for the first date in s formatted
    as on IVOA document landing pages.
    """
    mat = DATE_RE.search(s)
    if not mat:
        raise Exception("No date visible in %s"%repr(s))
    return (int(mat.group(3)),
        MONTH_NAMES.index(mat.group(2))+1,
        int(mat.group(1)))
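
# Example (illustrative input, modeled on a typical landing-page
# tagline):
#   parse_subhead_date("IVOA Recommendation 11 April 2012")
# returns (2012, 4, 11).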


def format_abstract(el):
    """returns plain text from a BeautifulSoup element.

    This traverses the tree, stopping when it encounters the first div.
    Only very little markup is supported (all we have is ADS' abstract
    syntax).
    """
    accum = []

    if isinstance(el, BeautifulSoup.NavigableString):
        accum.append(el.string)

    elif el.name=="div":
        # this is probably bad document structure, in that this div
        # should not be a child of the abstract.  Stop collecting, but
        # pass upstream what we've collected so far.
        raise Finished(" ".join(accum))

    elif el.name in ("ul", "ol"):
        # can't see a way to properly do ul in running text, so folding
        # it to ol.
        for index, child in enumerate(el.findAll("li", recursive=False)):
            accum.append(" (%s) %s "%(index+1, format_abstract(child)))

    else:
        if el.name=="p":
            accum.append("\n\n")
        for child in el:
            try:
                accum.append(format_abstract(child))
            except Finished, rest:
                raise Finished(" ".join(accum+[rest.payload]))

    return " ".join(accum)
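
# Illustrative example (hypothetical input): fed the soup element for
# "<p>We define: <ol><li>terms</li><li>models</li></ol></p>", this
# returns roughly "\n\n We define:   (1) terms    (2) models  ".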


def get_abstract_text(soup):
    """returns a guess for what the abstract within soup is.

    Unfortunately, the abstract isn't marked up well on IVOA landing
    pages.  Hence, we just look for the headline and gobble up material
    until we reach a div after that.
    """
    abstract_head = get_enclosing_element(soup, "h2", "Abstract")
    el = abstract_head.nextSibling
    accum = []
    while getattr(el, "name", None)!="div":
        try:
            accum.append(format_abstract(el))
        except Finished, rest:
            # div found as abstract child, suspect malformed document.
            accum.append(rest.payload)
            break
        el = el.nextSibling
    return " ".join(accum)


def clean_field(s):
    """returns s with normalised space and similar, ready for inclusion
    into ADS' tagged format.

    Don't do this to abstracts.
    """
    # Oh shucks, "Grid *and* Web Services" requires a special hack.
    return re.sub(",? and ", ", ",
        re.sub(r"\s+", " ", s)).replace("Grid, ", "Grid and ")
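
# Example (illustrative): clean_field("T. Boch and  M. Fitzpatrick")
# gives "T. Boch, M. Fitzpatrick"; the working group name
# "Grid and Web Services" survives thanks to the hack above.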


SHORT_NAME_EXCEPTIONS = {
    "VOT": "VOTable"
}

def guess_short_name(url_in_docrepo):
    """guesses the short name of a document based on its docrepo URL.

    Due to historically confusing practices, this is hard to do.  Our
    heuristic: we throw out known parts of common URLs and take the
    segment that scores highest (i.e., contains the most uppercase
    letters).

    >>> guess_short_name("http://www.ivoa.net/documents/SAMP/20120411/")
    'SAMP'
    >>> guess_short_name("www.ivoa.net/documents/cover/SAMP-20090421.html")
    'SAMP'
    >>> guess_short_name("http://www.ivoa.net/documents/cover/VOT-20040811.html")
    'VOTable'
    """
    # cut prefix
    local_path = re.sub(".*documents/", "", url_in_docrepo)
    # cut known junk
    unjunked = re.sub("index.html", "",
        re.sub("cover/", "", local_path))
    # score candidates according to the number of uppercase letters
    scored = list(sorted((len(re.sub("[^A-Z]+", "", s)), s)
        for s in re.split("[/-]", unjunked)))
    # fail if inconclusive
    if len(scored)>1 and scored[-1][0]==scored[-2][0]:
        raise Error("Cannot infer short name: %s"%url_in_docrepo)

    return SHORT_NAME_EXCEPTIONS.get(scored[-1][1], scored[-1][1])


def parse_landing_page(url, local_metadata):
    """returns a dictionary of document properties for a document taken from
    its landing page.
    """
    soup = BeautifulSoup.BeautifulSoup(get_with_cache(url))
    authors = clean_field(
        get_enclosing_element(soup, "dt", "Author(s):"
            ).findNextSibling("dd").getText(" "))
    editors = clean_field(get_enclosing_element(soup, "dt", "Editor(s):"
        ).findNextSibling("dd").getText(" "))
    tagline = soup.find("h2").text
    date = parse_subhead_date(tagline)
    abstract = get_abstract_text(soup).replace("\r", "")

    title = clean_field(soup.find("h1").getText(" "))
    journal = tagline

    pdf_enclosure = get_enclosing_element(soup, "a", "PDF")
    if pdf_enclosure:
        pdf = urlparse.urljoin(url, pdf_enclosure.get("href"))

    try:
        arXiv_id = local_metadata.get_arXiv_id_for_URL(url)
    except KeyError:
        # That's ok for notes, and checked separately for RECs
        pass

    del soup
    return locals()
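
# Note (added for clarity): since parse_landing_page returns locals(),
# the resulting dict contains at least url, authors, editors, tagline,
# date, abstract, title, and journal, plus pdf and arXiv_id when they
# could be determined -- and, as a side effect of the locals() trick,
# the local_metadata argument itself.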


########################## Screen scraping the index page

def iter_links_from_table(src_table, rec_class):
    """yields anchor elements for finished standards from src_table.

    src_table is a BeautifulSoup node for one of our documents-in-progress
    tables (realistically, recommendations or endorsed notes).

    rec_class is a CSS class name which marks links to finished standards
    in the respective table (in reality, en or rec).
    """
    for links in src_table.findAll("td", {"class": "versionold"}):
        for anchor in links.findAll("a", {"class": rec_class}):
            yield anchor


def iter_REC_URLs(doc_index, repo_url):
    """iterates over URLs to RECs (different versions are different
    documents).

    doc_index is a BeautifulSoup of the IVOA documents repo.  Each URL
    in a class=rec anchor will be returned exactly once.  Document
    order is maintained.
    """
    seen_stds = set()
    rec_table = get_enclosing_element(doc_index, "h3",
        "Technical Specifications").findNextSibling("table")
    en_table = get_enclosing_element(doc_index, "h3",
        "Endorsed Note").findNextSibling("table")

    for anchor in itertools.chain(
            iter_links_from_table(rec_table, "rec"),
            iter_links_from_table(en_table, "en")):
        # we'll fix URLs to some degree here; in particular,
        # uppercase Documents, which was fairly common in the old days,
        # is lowercased.
        url = urlparse.urljoin(repo_url, anchor.get("href"
            ).replace("Documents", "documents"))

        if url in seen_stds:
            continue
        seen_stds.add(url)
        yield url


def iter_Notes_URLs():
    """iterates over URLs of published notes.

    Right now, most notes are not pushed to ADS.  Instead, the exec
    lists the ones it wants published, and the document coordinator
    manually adds the URLs to published_notes.txt.
    """
    with open("published_notes.txt") as f:
        for ln in f:
            if ln.strip() and not ln.startswith("#"):
                yield ln.strip()
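
# published_notes.txt contains one landing-page URL per line, with
# #-comments and blank lines ignored; hypothetical content:
#
#   # notes cleared by the exec for ADS publication
#   http://www.ivoa.net/documents/Notes/SomeNote/20140307/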


########################## record generation logic

class Document(dict):
    """Metadata of an IVOA document.

    These are constructed with a dictionary of items found; this
    includes authors*, editors*, date*, abstract*, title*, type*
    (spec/rept), pdf (its URL), url* (of the landing page), journal*,
    arXiv_id (mandatory for RECs), but additional arbitrary keys are
    allowed.  Items with stars are mandatory.

    You'll usually use the from_URL class function to construct one
    from an IVOA document landing page.

    >>> Document(TEST_DATA["ru"])
    Traceback (most recent call last):
    ValidationError: Document at http://foo/bar: Missing key(s) date, editors
    >>> d = Document(TEST_DATA["r1"])
    >>> d["authors"]
    'Greg Ju, Fred Gnu Test, Wang Chu'
    >>> d.bibcode
    '2014ivoa.spec.0307J'
    >>> d.as_ADS_record()[:59]
    '%R 2014ivoa.spec.0307J\\n%D 3/2014\\n%I ELECTR: http://foo/bar;'
    >>> d2 = Document.from_URL("http://www.ivoa.net/documents/SAMP/20120411"
    ...     "/index.html", TEST_DATA["lm"])
    >>> d2["authors"]
    u'T. Boch, M. Fitzpatrick, M. Taylor, A. Allan, J. Fay, L. Paioro, J. Taylor, D. Tody'
    >>> d2.bibcode
    u'2012ivoa.spec.0411B'
    >>> Document(TEST_DATA["rr"])
    Traceback (most recent call last):
    Error: RECs must have arXiv_id (add to arXiv_ids.txt); failing on document at http://foo/failrec
    >>> Document(TEST_DATA["rme"])["authors"]
    'First Editor, Second Editor, Some Guy, Guy Rixon'
    """

    mandatory_keys = frozenset(
        ["url", "authors", "editors", "date", "abstract", "title", "journal"])
    key_to_ads = [
        ("authors", "A"),
        ("editors", "e"),
        ("title", "T"),
        ("source", "G"),
        ("journal", "J"),
        ("abstract", "B"),
    ]

    def __init__(self, vals):
        dict.__init__(self, vals)
        self["source"] = "IVOA"
        self.validate()
        self._perform_editor_hack()
        self._infer_type()
        if self["type"]=="spec":
            if not self.get("arXiv_id"):
                raise Error("RECs must have arXiv_id (add to arXiv_ids.txt);"
                    " failing on document at %s"%(self["url"]))

    @classmethod
    def from_URL(cls, url, local_metadata):
        """returns a new Document made from the IVOA landing page at url.
        """
        return cls(parse_landing_page(url, local_metadata))

    def validate(self):
        """raises a ValidationError if one or more of the mandatory_keys
        are missing.
        """
        missing_keys = self.mandatory_keys-set(self)
        if missing_keys:
            raise ValidationError("Document at %s: Missing key(s) %s"%(
                self.get("url", "<unknown origin>"), ", ".join(missing_keys)))

    def _infer_type(self):
        """decides whether this document is a spec (Recommendation) or
        rept (Note).

        We currently do this according to the journal field (specs have
        "Recommendation" or "Endorsed Note" in there).
        """
        if ("Recommendation" in self["journal"]
                or "Endorsed Note" in self["journal"]):
            self["type"] = "spec"
        else:
            self["type"] = "rept"

    def _perform_editor_hack(self):
        """fudges the authors list to include the editor(s) in the first
        place.

        This was the express wish of Francoise Genova to provide sufficient
        credit to the editors who, typically, did most of the work that went
        into a document.

        This method is called by the constructor; it's designed to be
        idempotent.
        """
        if not self["editors"].strip():
            return

        eds = set(s.strip() for s in self["editors"].split(","))
        auths = [s.strip() for s in self["authors"].split(",")]

        # sanity check: if an item of eds or auths contains no blank,
        # we have a bad author format
        name_pattern = re.compile(r"[\. ]")
        if ([item for item in eds if not name_pattern.search(item)]
                or [item for item in auths if not name_pattern.search(item)]):
            raise Exception("Bad author format suspected in %s/%s"%(
                eds, auths))

        non_editors = ", ".join(item for item in auths if item not in eds)
        if non_editors:
            self["authors"] = "%s, %s"%(self["editors"], non_editors)
        else:
            self["authors"] = self["editors"]

    def get_first_author_surname(self):
        """returns the surname for the first author.

        This is pure heuristics -- we need it for bibcode generation, and
        hence we should keep this in sync with what ADS wants.
        """
        # current heuristic: the last "word" in front of the first comma.
        # This will fail for many interesting cases, but IVOA
        # contributors appear to have tame names for now.
        return self["authors"].split(",")[0].split()[-1]

    @property
    def bibcode(self):
        """returns the bibcode for this record.
        """
        year, month, day = self["date"]
        return "%sivoa.%s%s%02d%02d%s"%(
            year, self["type"],
            BIBCODE_QUALIFIERS.get(self["url"], "."),
            month, day,
            self.get_first_author_surname()[0])
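
    # Added note: bibcodes thus follow the pattern
    # <year>ivoa.<spec|rept><qualifier or .><MM><DD><first author initial>;
    # the doctest above shows 2012ivoa.spec.0411B for the SAMP REC of
    # 2012-04-11 with first author T. Boch.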

    def as_ADS_record(self):
        """returns this document in UTF-8 encoded ADS tagged format.
        """
        parts = ["%%R %s"%self.bibcode]

        year, month, day = self["date"]
        parts.append("%%D %s/%s"%(month, year))

        links = "%%I ELECTR: %s"%self["url"]
        if "pdf" in self:
            links += ";\nPDF: %s"%self["pdf"]
        if "ivoadoc-id" in self:
            links += ";\nEPRINT: %s"%self["ivoadoc-id"]
        if "arXiv_id" in self:
            links += ";\nARXIV: %s"%self["arXiv_id"]
        parts.append(links)

        for our_key, ads_key in self.key_to_ads:
            if our_key in self:
                parts.append("%%%s %s"%(ads_key, self[our_key]))

        return "\n".join(parts).encode("utf-8")
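
    # For TEST_DATA["r1"] in _test below, the output starts (cf. the
    # doctest in the class docstring):
    #   %R 2014ivoa.spec.0307J
    #   %D 3/2014
    #   %I ELECTR: http://foo/bar;
    #   PDF: uh;
    #   ARXIV: a-p/1
    # followed by %A, %e, %T, %G, %J, and %B lines per key_to_ads.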


class DocumentCollection(object):
    """A collection of IVOA document metadata.

    This also contains logic that needs to see the entire collection.

    It is constructed with a sequence of Document instances; you
    will usually use the from_repo_URL class method which takes the
    URL of the IVOA's document collection.

    These things are conceptually immutable (i.e., you're not supposed
    to change self.docs).

    The main interface to this is iteration -- you'll get all the
    documents in temporal order.

    >>> dc = DocumentCollection(
    ...     Document(TEST_DATA[k]) for k in "r1 r2 r3".split())
    >>> dc.docs[0].bibcode
    '2014ivoa.spec.0307J'
    """
    def __init__(self, docs):
        self.docs = list(docs)
        self._sort_recs()
        self._create_identifiers()
        self.validate()

    @classmethod
    def from_repo_URL(cls, root_url, local_metadata):
        """returns a DocumentCollection ready for export, constructed
        from the index at root_url.
        """
        doc_index = BeautifulSoup.BeautifulSoup(
            requests.get(root_url).text)
        docs = []

        for url in itertools.chain(
                iter_REC_URLs(doc_index, root_url),
                iter_Notes_URLs()):
            try:
                docs.append(
                    Document.from_URL(
                        urlparse.urljoin(root_url, url), local_metadata))
            except KeyboardInterrupt:
                raise
            except:
                sys.stderr.write("\nIn document %s:\n"%url)
                traceback.print_exc()
        return cls(docs)

    def __iter__(self):
        return iter(self.docs)

    def validate(self):
        """runs some simple tests to avoid certain undesirable situations.

        Problems will lead to a ValidationError being raised.
        """
        docs_per_bibcode = {}
        for doc in self:
            docs_per_bibcode.setdefault(doc.bibcode, []).append(doc)
        dupes = [item for item in docs_per_bibcode.iteritems()
            if len(item[1])>1]
        if dupes:
            raise ValidationError("The following documents generated"
                " clashing bibcodes: %s. Fix by adding one of them to"
                " BIBCODE_QUALIFIERS in the source."%(
                    " AND ALSO\n".join(
                        " and ".join(c["url"] for c in clashing[1])
                        for clashing in dupes)))

    def _make_ivoadoc_id(self, rec, index):
        """returns, for a rec as returned by parse_landing_page
        and the document index within the publication month,
        the document's IVOA document id.

        The IVOA document id has the form
        ivoa:<t>.<year>.<month>.<count>.  count is a running
        number per month, where documents are sorted first
        by date, then by first author last name, and finally
        by title.  <t> is r for a REC-type thing, n for a
        NOTE-like thing.

        This is a helper for _create_identifiers.
        """
        return "ivoa:%s.%04d.%02d.%02d"%(
            "r" if rec["type"]=="spec" else "n",
            rec["date"][0],
            rec["date"][1],
            index)
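
    # Example (illustrative): a spec published 2014-03-07 that is the
    # third document of its month (index 2) gets "ivoa:r.2014.03.02".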

    def _sort_recs(self):
        """sorts our records as required for IVOA identifier generation.

        That is, sorted by date, authors, and titles, in that order.
        This is called by the constructor.
        """
        self.docs.sort(key=lambda rec: rec["date"]+(
            rec.get_first_author_surname(), rec["title"]))

    def _get_month_partition(self):
        """returns a dictionary mapping (year, month) to the documents
        published in that month.

        This is a helper for _create_identifiers.
        """
        by_month = {}
        for rec in self.docs:
            year, month, day = rec["date"]
            by_month.setdefault((year, month), []).append(rec)
        return by_month

    def _create_identifiers(self):
        """adds ivoadoc-id keys to every record in self.

        See _make_ivoadoc_id for what this is.

        This is called by the constructor.
        """
        for (year, month), recs in self._get_month_partition().iteritems():
            for index, rec in enumerate(
                    d for d in recs if d["type"]=="spec"):
                rec["ivoadoc-id"] = self._make_ivoadoc_id(rec, index)
            for index, rec in enumerate(
                    d for d in recs if d["type"]=="rept"):
                rec["ivoadoc-id"] = self._make_ivoadoc_id(rec, index)


########################## local metadata injection

class LocalMetadata(object):
    """A container for parsed metadata kept in the SVN repo.

    Currently, that's a mapping from document short names to arXiv ids,
    kept in arXiv_map.  By Exec decree, this is only available for IVOA
    RECs.
    """
    def __init__(self):
        self._load_arXiv_map()

    def _load_arXiv_map(self):
        self.arXiv_map = {}
        with open("arXiv_ids.txt") as f:
            for ln_index, ln in enumerate(f):
                try:
                    if ln.strip():
                        short_name, arXiv_id = ln.split()
                        self.arXiv_map[short_name.strip()] = arXiv_id.strip()
                except ValueError:
                    sys.exit("arXiv_ids.txt, line %s: entry not in"
                        " <local><white><arxiv> format."%(ln_index+1))

    def get_arXiv_id_for_URL(self, url):
        """returns the arXiv id based on a URL into the document repository.

        This involves guessing the short name, which may fail for weirdly
        formed docrepo URLs.

        If the lookup itself fails, a KeyError with the original url is
        raised.
        """
        short_name = guess_short_name(url)
        if short_name in self.arXiv_map:
            return self.arXiv_map[short_name]
        raise KeyError(url)


########################## ADS interface

def filter_unpublished_bibcodes(bibcodes, auth):
    """iterates over the bibcodes from bibcodes that ADS does not know
    about yet.
    """
    params = {
        'q': '*:*',
        'rows': 1000,
        'wt': 'json',
        'fq': '{!bitset}',
        'fl': 'bibcode'}
    payload = "bibcode\n"+"\n".join(bibcodes)

    req = requests.post(ADS_ENDPOINT,
        params=params,
        headers={'Authorization': 'Bearer:%s'%auth},
        data=payload)
    response = json.loads(req.text)

    if response["responseHeader"]["status"]!=0:
        raise ExternalError("ADS API returned error: %s"%repr(response))

    known_bibcodes = set(r["bibcode"] for r in response["response"]["docs"])
    for bibcode in bibcodes:
        if bibcode not in known_bibcodes:
            yield bibcode
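
# Sketch of intended use (hypothetical token value):
#   fresh = list(filter_unpublished_bibcodes(
#       ["2012ivoa.spec.0411B", "2014ivoa.spec.0307J"], "my-ads-token"))
# fresh then only contains the bibcodes ADS does not have yet.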


########################## command line interface

def _test():
    """runs the embedded doctests.
    """
    import doctest, harvest
    harvest.TEST_DATA = {
        "r1": {"url": "http://foo/bar", "title": "Test doc",
            "authors": "Fred Gnu Test, Wang Chu", "editors": "Greg Ju",
            "date": (2014, 3, 7), "abstract": "N/A", "pdf": "uh",
            "journal": "IVOA Recommendation", "arXiv_id": "a-p/1"},
        "r2": {"url": "http://foo/baz", "title": "More Testing",
            "authors": u"René Descartes", "editors": "J.C. Maxwell",
            "date": (2014, 3, 7), "abstract": "N/A",
            "journal": "IVOA Recommendation", "arXiv_id": "a-p/2"},
        "r3": {"url": "http://foo/quux", "title": "Still more",
            "authors": "Leonhard Euler, Georg Cantor",
            "editors": "Frederic Chopin",
            "date": (2014, 5, 7), "abstract": "N/A",
            "journal": "IVOA Note"},
        "ru": {"url": "http://foo/bar", "title": "Test doc",
            "journal": "Broken Mess", "abstract": "", "authors": "X"},
        "rr": {"url": "http://foo/failrec", "title": "Test REC",
            "authors": "Fred Gnu Test, Wang Chu", "editors": "Greg Ju",
            "date": (2014, 3, 7), "abstract": "N/A", "pdf": "uh",
            "journal": "IVOA Recommendation"},
        "rme": {"url": "http://foo/twoeditors", "title": "I have two editors",
            "authors": "Second Editor, Some Guy, Guy Rixon, First Editor",
            "editors": "First Editor, Second Editor",
            "date": (2014, 3, 20), "abstract": "N/A",
            "journal": "IVOA Note"},
        "lm": LocalMetadata(),
    }
    doctest.testmod(harvest)


def parse_command_line():
    """returns parsed command line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Generate ADS records from the IVOA document repo.")
    parser.add_argument("-r", "--repo-url",
        action="store", dest="repo_url",
        help="Use URL as the document repository's URL",
        metavar="URL", default="http://www.ivoa.net/documents/")
    parser.add_argument("-t", "--test-only",
        action="store_true", dest="run_tests",
        help="Only run doctests, then exit (requires network).")
    parser.add_argument("-C", "--use-cache",
        action="store_true", dest="cache_web",
        help="Use cached copies of things obtained from the net"
            " (or create these caches).")
    parser.add_argument("-a", "--ads-token",
        action="store", type=str, dest="ads_token",
        help="ADS access token to filter out records already in ADS.",
        default=None)
    parser.add_argument("-s", "--single-doc",
        action="store", dest="doc_url",
        help="Only translate the document with landing page URL (only for"
            " testing/debugging; bibcodes may be wrong).",
        metavar="URL", default=None)
    return parser.parse_args()
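
# Typical invocations (illustrative):
#   python harvest.py > ivoa.ads                # everything
#   python harvest.py -a $ADS_TOKEN > new.ads   # only records new to ADS
#   python harvest.py -t                        # run the doctests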


def main():
    global CACHE_RESULTS
    args = parse_command_line()
    if args.cache_web or args.run_tests:
        CACHE_RESULTS = True

    if args.run_tests:
        _test()
        return

    local_metadata = LocalMetadata()
    if args.doc_url:
        dc = DocumentCollection(
            [Document.from_URL(args.doc_url, local_metadata)])
    else:
        dc = DocumentCollection.from_repo_URL(
            args.repo_url, local_metadata)

    limit_to = None
    if args.ads_token:
        limit_to = set(filter_unpublished_bibcodes(
            [doc.bibcode for doc in dc], args.ads_token))

    for rec in dc:
        if limit_to is not None:
            if rec.bibcode not in limit_to:
                continue

        print rec.as_ADS_record()
        print ""


if __name__=="__main__":
    try:
        main()
    except ValidationError, msg:
        sys.stderr.write(str(msg)+"\n")
        sys.stderr.write(
            "\nDocument repository invalid, not generating records.\n")
