/[volute]/trunk/projects/semantics/vocinvo2/revovo.py
ViewVC logotype

Contents of /trunk/projects/semantics/vocinvo2/revovo.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 5596 - (show annotations)
Tue Aug 27 14:43:53 2019 UTC (23 months ago) by msdemlei
File MIME type: text/x-python
File size: 12021 byte(s)
vocinvo2: Adding revovo.

This will grow into a reference implementation for how to consume
our vocabularies without RDF tooling, and a bit of a validator for the
RDF/X representation.

At this point, the basic functionality of parsing the various vocabulary
flavours is there, but not much else (and there's certainly quite
a bit missing in terms of robustness).



1 """
2 This python module ("REad VOcabularies in the VO") is a small reference
3 implementation pulling the central pieces of information from vocabularies
4 conforming to version 2 of the Vocabularies in the VO recommendation.
5
6 It can be used as a standalone so-so validator (it's not checking every
7 conceivable aspect), and it can be dropped into other software (and then
8 licensed as required there) and used as a library.
9
10 To run the embedded doctests, run the script without arguments.
11
12 This is written for python3 and has no dependencies beyond the standard
13 python library.
14
15 Written by Markus Demleitner <msdemlei@ari.uni-heidelberg.de> in 2019.
16
17 This code is in the public domain.
18 """
19
20 import re
21 import sys
22 from urllib.request import Request, urlopen
23 from xml.etree import ElementTree
24
25
# Maps namespace URIs to the canonical prefixes used throughout this
# module.  Incoming names are rewritten to these prefixes so the rest of
# the code can compare against short, readable CURIEs.
PREFIX_DEF = {
    uri: prefix
    for prefix, uri in [
        ("dc", "http://purl.org/dc/terms/"),
        ("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"),
        ("rdfs", "http://www.w3.org/2000/01/rdf-schema#"),
        ("owl", "http://www.w3.org/2002/07/owl#"),
        ("skos", "http://www.w3.org/2004/02/skos/core#"),
        ("ivoasem", "http://www.ivoa.net/rdf/ivoasem#"),
    ]
}
36
37
# For each vocabulary flavour: which predicates carry a term's label,
# its description and its wider term(s), and which rdf:type marks a
# resource as a term.  All values are CURIEs using PREFIX_DEF prefixes.
PROPERTIES_BY_FLAVOUR = {
    "RDF Class": dict(
        label_property="rdfs:label",
        description_property="rdfs:comment",
        wider_property="rdfs:subClassOf",
        term_type="rdfs:Class",
    ),
    "RDF Property": dict(
        label_property="rdfs:label",
        description_property="rdfs:comment",
        wider_property="rdfs:subPropertyOf",
        term_type="rdf:Property",
    ),
    "SKOS": dict(
        label_property="skos:prefLabel",
        description_property="skos:definition",
        wider_property="skos:broader",
        term_type="skos:Concept",
    ),
}
59
def prefixify(etree_name):
    """returns etree_name with its namespace URI replaced by the canonical
    prefix from PREFIX_DEF.

    etree_name is a tag or attribute name as produced by etree: a plain
    string, possibly in the {ns-url}name form.  Names without a namespace,
    and names in namespaces missing from PREFIX_DEF, come back unchanged.

    >>> prefixify("{http://purl.org/dc/terms/}foo")
    'dc:foo'
    >>> prefixify("{http://dc.g-vo.org}foo")
    '{http://dc.g-vo.org}foo'
    >>> prefixify("src")
    'src'
    """
    if not etree_name.startswith('{'):
        return etree_name

    parsed = re.match("{([^}]+)}(.*)", str(etree_name))
    if not parsed:
        return etree_name

    namespace, local_name = parsed.groups()
    if namespace not in PREFIX_DEF:
        return etree_name

    return "{}:{}".format(PREFIX_DEF[namespace], local_name)
80
81
def prefixify_attrib(etree_attrib):
    """returns a plain dictionary built from etree_attrib in which every
    attribute name has been passed through prefixify.

    Values are copied over untouched.
    """
    return {
        prefixify(attr_name): attr_value
        for attr_name, attr_value in etree_attrib.items()}
89
def _make_url_prefixifier():
    """returns a function turning URLs with known prefixes into the
    prefix form.

    The namespace URIs from PREFIX_DEF are compiled into a single regular
    expression once, when this factory is called.
    """
    # Regex alternation picks the first alternative that matches, not the
    # longest one.  Trying longer namespace URIs first keeps the result
    # correct even if one namespace URI is a prefix of another.
    namespaces = sorted(PREFIX_DEF, key=len, reverse=True)
    prefix_re = re.compile(
        "({})".format("|".join(re.escape(u) for u in namespaces)))

    def prefixify_url(url):
        """returns the prefix form of url if it starts with a known prefix.

        It will return url unchanged otherwise; this includes non-string
        values such as None.

        >>> prefixify_url("http://purl.org/dc/terms/author")
        'dc:author'
        >>> prefixify_url("abc")
        'abc'
        """
        if not isinstance(url, str):
            return url
        mat = prefix_re.match(url)
        if mat:
            # Slice the remainder instead of capturing it with ".*", which
            # would stop at the first newline and drop the rest.
            return "{}:{}".format(PREFIX_DEF[mat.group(1)], url[mat.end():])
        return url

    return prefixify_url


prefixify_url = _make_url_prefixifier()
del _make_url_prefixifier
120
121
class Vocabulary(object):
    """A facade for the major properties of VO vocabularies.

    This exposes:

    * terms -- a dictionary term -> (label, description).
    * deprecated_terms -- a dictionary mapping deprecated terms to a list
      of their replacements.  Things just deprecated map to an empty list.
    * preliminary_terms -- a set of preliminary terms.
    * wider_terms -- a dictionary of term -> [wider terms]
    * uri -- the vocabulary URI
    * flavour -- the kind of vocabulary; as of Vocinvo 2.0, this
      can be one of 'RDF Class', 'RDF Property' or 'SKOS'.  It is None
      when the vocabulary declares no flavour.

    * errors -- a list of strings naming errors encountered during parsing.
      Clearly, this should be empty for a half-way reasonable vocabulary
      (but an empty errors list is not a sufficient condition for
      validity by the REC).

    Terms are given without the vocabulary URI and without the leading
    hash sign.

    It is likely that for actual use, you will want to derive some class
    with application-specific postprocessing.  The simplest way to achieve
    this is to override the postprocess method (that is a no-op in the
    default implementation, so there's no reason to up-call).

    You will usually construct these using the from_file class method;
    if you already have triples, feel free to construct them directly.
    In triples, all members must be written as CURIES ("rdfs:label")
    if they are URIs starting with something in PREFIX_DEF (use
    prefixify_url if necessary).
    """
    def __init__(self, triples):
        self.terms = {}
        self.deprecated_terms = {}
        self.preliminary_terms = set()
        self.wider_terms = {}
        self.errors = []
        self.uri = "Vocabulary URI not found in RDF/X"
        self.flavour = None
        # Flavour-specific property names; they stay None unless a known
        # flavour is found by _get_vocab_uri.
        self.label_property = None
        self.description_property = None
        self.wider_property = None
        self.term_type = None

        self._build_vocabulary(triples)
        self.postprocess()

    @classmethod
    def from_file(cls, fp):
        """returns a Vocabulary read from fp.

        fp must be an open file-like object containing RDF/X.

        Only child elements of an element carrying rdf:about generate
        triples; the parent's rdf:about is the subject, the element name
        the predicate, and rdf:resource (or, failing that, the element
        text) the object.  Elements without content yield a None object.
        """
        triples = []
        elem_stack = []  # containing (name, attrib) pairs with canonical
                         # prefixes.

        # NOTE(review): typed node elements (e.g., a skos:Concept element
        # with rdf:about) are not turned into rdf:type triples here; only
        # explicit rdf:type children are.
        triple_generating_elements = {
            'rdf:type',
            'rdfs:label', 'rdfs:comment',
            'rdfs:subClassOf', 'rdfs:subPropertyOf',
            'skos:prefLabel', 'skos:definition', 'skos:broader',
            'ivoasem:preliminary', 'ivoasem:deprecated', 'ivoasem:useInstead',
            'ivoasem:vocflavour',
        }

        for event, elem in ElementTree.iterparse(
                fp, events=["start", "end"]):
            if event == "start":
                elem_stack.append(
                    (prefixify(elem.tag), prefixify_attrib(elem.attrib)))
                continue

            # event=="end"
            tag_name, attrs = elem_stack.pop()
            if tag_name not in triple_generating_elements:
                continue

            # The subject is whatever the enclosing element is about; a
            # property element at the root or below an element without
            # rdf:about has no usable subject.
            subject = elem_stack[-1][1].get("rdf:about") if elem_stack else None
            if subject is None:
                continue

            # Empty elements such as <ivoasem:preliminary/> have neither
            # rdf:resource nor text; keep None as their object.
            obj = attrs.get("rdf:resource", elem.text)
            if obj is not None:
                # We probably should only prefixify rdf:resource
                # values; but then, errors here are highly improbable.
                obj = prefixify_url(obj)

            triples.append((prefixify_url(subject), tag_name, obj))

        return cls(triples)

    def postprocess(self):
        """called when all triples are digested.

        To be overridden by subclasses.  The default implementation is a
        no-op.
        """
        pass

    def to_term(self, uri):
        """returns the term (the thing behind the #) if uri is in this
        vocabulary's namespace, the full uri otherwise.

        None is passed through unchanged.
        """
        if uri is None or not uri.startswith(self.uri):
            return uri
        term = uri[len(self.uri):]
        # The vocabulary URI normally does not include the hash sign
        # separating it from the term; drop it so callers get bare terms.
        if term.startswith("#"):
            term = term[1:]
        return term

    ############## only constructor helpers beyond this point

    def _add_error(self, error_string):
        """adds an error message.

        A constructor helper, not for users.
        """
        self.errors.append(error_string)

    def _in_vocabulary(self, uri):
        """returns True if uri is a string starting with the vocabulary URI.

        A constructor helper, not for users.
        """
        return uri is not None and uri.startswith(self.uri)

    def _get_vocab_uri(self, by_property):
        """sets the vocabulary URI, the flavour, and the flavour-specific
        property names (label_property, description_property,
        wider_property, term_type).

        If no flavour is declared or the flavour is unknown, an error is
        recorded and the flavour-specific attributes are left alone.

        A constructor helper, not for users.
        """
        pairs = by_property.get('ivoasem:vocflavour')

        if not pairs:
            self._add_error("No ivoasem:vocflavour declared.  Is this"
                " an IVOA vocabulary?")
            return
        if len(pairs) > 1:
            self._add_error("More than one ivoasem:vocflavour clause"
                " found.  Picking one at random.  This is going to"
                " be trouble.")

        self.uri, self.flavour = pairs[0]

        if self.flavour not in PROPERTIES_BY_FLAVOUR:
            self._add_error("Flavour {} unknown.  This must be one of {}."
                .format(self.flavour, ", ".join(PROPERTIES_BY_FLAVOUR)))
            return

        for key, value in PROPERTIES_BY_FLAVOUR[self.flavour].items():
            setattr(self, key, value)

    def _build_terms(self, by_property):
        """fills the terms attribute.

        A term is any subject within the vocabulary that has an rdf:type
        equal to the flavour's term_type.

        A constructor helper, not for users.
        """
        labels = dict(by_property.get(self.label_property, []))
        definitions = dict(by_property.get(self.description_property, []))

        for s, o in by_property.get("rdf:type", []):
            if o == self.term_type and self._in_vocabulary(s):
                self.terms[self.to_term(s)] = (
                    labels.get(s),
                    definitions.get(s))

    def _build_hierarchy(self, by_property):
        """fills the wider_terms attribute.

        A constructor helper, not for users.
        """
        for s, o in by_property.get(self.wider_property, []):
            if self._in_vocabulary(s) and o is not None:
                # A term may have several wider terms, so collect them
                # rather than overwriting earlier ones.
                self.wider_terms.setdefault(
                    self.to_term(s), []).append(self.to_term(o))

    def _build_deprecated_terms(self, by_property):
        """fills the deprecated_terms attribute.

        A constructor helper, not for users.
        """
        for s, _ in by_property.get("ivoasem:deprecated", []):
            if self._in_vocabulary(s):
                self.deprecated_terms.setdefault(self.to_term(s), [])

        for s, o in by_property.get("ivoasem:useInstead", []):
            if not self._in_vocabulary(s) or o is None:
                continue
            term = self.to_term(s)
            if term in self.deprecated_terms:
                self.deprecated_terms[term].append(self.to_term(o))
            else:
                self._add_error("UseInstead given for non-deprecated"
                    " term {}.  Ignoring.".format(term))

    def _build_preliminary(self, by_property):
        """fills the preliminary_terms attribute.

        A constructor helper, not for users.
        """
        for s, _ in by_property.get("ivoasem:preliminary", []):
            if self._in_vocabulary(s):
                self.preliminary_terms.add(self.to_term(s))

    def _build_vocabulary(self, triples):
        """builds the vocabulary from RDF triples.

        A constructor helper, not for users.
        """
        by_property = {}
        for s, p, o in triples:
            by_property.setdefault(p, []).append((s, o))

        self._get_vocab_uri(by_property)
        if getattr(self, "term_type", None) is None:
            # Without a known flavour we cannot tell which properties
            # define terms; the problem is already recorded in errors.
            return

        self._build_terms(by_property)
        self._build_hierarchy(by_property)
        self._build_deprecated_terms(by_property)
        self._build_preliminary(by_property)
314
315
def load_vocabulary(voc_spec):
    """returns a Vocabulary instance for voc_spec.

    voc_spec is either a (http/https) URL or a path to a local file.
    For URLs, RDF/X is requested through the accept header.

    Whatever opening the resource or parsing it raises is passed on to
    the caller; the underlying file or connection is closed either way.
    """
    if re.match("https?://", voc_spec):
        req = Request(voc_spec, headers={"accept": "application/rdf+xml"})
        in_file = urlopen(req)
    else:
        # Open in binary mode so the XML parser decodes the bytes itself
        # according to the document's encoding declaration, rather than
        # having the locale's default encoding applied first.
        in_file = open(voc_spec, "rb")

    with in_file:
        return Vocabulary.from_file(in_file)
331
332
def check_one(voc_spec):
    """reads a vocabulary and emits errors and properties about it on
    stdout.

    Each error recorded while building the vocabulary is printed on a
    line of its own, followed by the terms dictionary.
    """
    voc = load_vocabulary(voc_spec)
    # Without this, problems found during parsing would go unnoticed when
    # the module is used as a validator.
    for error in voc.errors:
        print("ERROR: {}".format(error))
    print(voc.terms)
339
340
def _test():
    """runs the doctests embedded in this module's docstrings (there
    are only a few of them).
    """
    from doctest import testmod
    testmod()
346
347
def main():
    """command line entry point.

    Without arguments, this runs the doctests and exits with a usage
    message; otherwise, every argument is checked as a vocabulary spec.
    """
    voc_specs = sys.argv[1:]

    if not voc_specs:
        _test()
        usage = (
            "Usage: {} <voc-spec> {{<voc-spec>}}\nwhere <voc-spec>"
            " either references a local RDF/X file or the vocabulary"
            " URL.")
        sys.exit(usage.format(sys.argv[0]))

    for spec in voc_specs:
        print("\n=== Vocabulary {}".format(spec))
        check_one(spec)


if __name__ == "__main__":
    main()
362
363 # vim:et:sta:sw=4

msdemlei@ari.uni-heidelberg.de
ViewVC Help
Powered by ViewVC 1.1.26