src/p4l/mapping/parsers.py
changeset 107 48440ff95906
child 119 ece69ca3ac24
equal deleted inserted replaced
106:71684a2ea502 107:48440ff95906
       
     1 # -*- coding: utf-8 -*-
       
     2 '''
       
     3 Created on Sep 20, 2013
       
     4 
       
     5 @author: ymh
       
     6 '''
       
     7 from rdflib.plugins.sparql.processor import prepareQuery
       
     8 from rdflib.term import URIRef
       
     9 from p4l.models.data import Language, Record
       
    10 
       
    11 
       
    12 class QueryCache(object):
       
    13     def __init__(self, *args, **kwargs):
       
    14         self.__query_cache = {}
       
    15 
       
    16     def get_sparql_query(self, query, namespaces_dict):
       
    17         return self.__query_cache.get(query, False) \
       
    18             or self.__query_cache.setdefault(query, prepareQuery(query, initNs=namespaces_dict))
       
    19     
       
    20 
       
    21 def convert_bool(val):
       
    22     if val == True or val == False:
       
    23         return val
       
    24     if val is None:
       
    25         return False
       
    26     if isinstance(val, basestring):
       
    27         if len(val) == 0:
       
    28             return False
       
    29         if val[0].lower() in ['t','y','1','o']:
       
    30             return True
       
    31         else:
       
    32             return False        
       
    33     return bool(val)
       
    34 
       
    35 class RecordParser(object):
       
    36 
       
    37     
       
    38     def __init__(self, query_cache = None):
       
    39         self.query_cache = None
       
    40         if self.query_cache is None:
       
    41             self.query_cache = QueryCache()        
       
    42     
       
    43     def extract_single_value_form_graph(self, graph, q, bindings={}, index=0, convert=lambda v:unicode(v) if v is not None else None, default=None):
       
    44         return next(self.extract_multiple_values_from_graph(graph, q, bindings, index, convert), default)
       
    45 
       
    46     def extract_multiple_values_from_graph(self, graph, q, bindings={}, index=0, convert=lambda v:unicode(v) if v is not None else None):
       
    47 
       
    48         index_list = index
       
    49         if isinstance(index, int):
       
    50             index_list = range(index+1)
       
    51 
       
    52         if hasattr(convert, '__call__'):
       
    53             convert_dict = dict((k, convert) for k in index_list)
       
    54         else:
       
    55             convert_dict = convert
       
    56 
       
    57         convert_dict = dict((k, f if hasattr(f,'__call__') else lambda v:unicode(v) if v is not None else None) for k,f in convert_dict.iteritems())
       
    58 
       
    59         for row in graph.query(self.query_cache.get_sparql_query(q, dict(graph.namespaces())), initBindings=bindings):
       
    60             if len(row) < len(index_list):
       
    61                 break
       
    62             else:
       
    63                 res = dict([ (k, convert_dict.get(k, lambda v:unicode(v) if v is not None else None)(v)) for k, v in zip(index_list, row)])
       
    64                 if isinstance(index, int):
       
    65                     yield res[index]
       
    66                 else:
       
    67                     yield res
       
    68 
       
    69 
       
    70     def convert_bool(self, val):
       
    71         if val == True or val == False:
       
    72             return val
       
    73         if val is None:
       
    74             return False
       
    75         if isinstance(val, basestring):
       
    76             if len(val) == 0:
       
    77                 return False
       
    78             if val[0].lower() in ['t','y','1','o']:
       
    79                 return True
       
    80             else:
       
    81                 return False        
       
    82         return bool(val)
       
    83 
       
    84 
       
    85     def add_to_related_collection(self, coll, graph, fields, q, bindings={},  convert=lambda v: unicode(v) if v is not None else None, through_fields=None):
       
    86         
       
    87         for val in self.extract_multiple_values_from_graph(graph, q, bindings=bindings, index=fields, convert=convert):
       
    88 
       
    89             if through_fields:                
       
    90                 new_obj_val = dict([(k,v) for k,v in val.iteritems() if k not in through_fields])
       
    91             else:
       
    92                 new_obj_val = val
       
    93 
       
    94             if hasattr(coll, 'through'):
       
    95                 new_obj_rel, _ = coll.model.objects.get_or_create(**new_obj_val)
       
    96                 if through_fields:
       
    97                     through_vals = {coll.source_field_name: coll.instance, coll.target_field_name: new_obj_rel}
       
    98                     through_vals.update(dict([(k,v) for k,v in val.iteritems() if k in through_fields]))
       
    99                     coll.through.objects.create(**through_vals)
       
   100                     new_obj = None
       
   101                 else:
       
   102                     new_obj = new_obj_rel
       
   103 
       
   104             else:
       
   105                 new_obj = coll.create(**new_obj_val)
       
   106             
       
   107             if new_obj:
       
   108                 coll.add(new_obj)
       
   109 
       
   110 
       
   111 
       
   112 
       
   113     def build_record(self, graph, delete=True):
       
   114 
       
   115         record_uri = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?s WHERE { ?s rdf:type iiep:Record .}")
       
   116         record_identifier = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s dct:identifier ?o .}", bindings={'s':URIRef(record_uri)})
       
   117         
       
   118         if delete:
       
   119             Record.objects.filter(identifier=record_identifier).delete()
       
   120 
       
   121         record = Record()
       
   122         record.uri = record_uri
       
   123         record.identifier = record_identifier
       
   124         record.notes = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:notes ?o .}", bindings={'s':URIRef(record.uri)})
       
   125         record.recordType = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s dct:type ?o .}", bindings={'s':URIRef(record.uri)})
       
   126         record.isDocumentPart = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:isDocumentPart ?o .}", bindings={'s':URIRef(record.uri)}, convert=self.convert_bool, default=False)
       
   127         record.hidden = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:hidden ?o .}", bindings={'s':URIRef(record.uri)}, convert=self.convert_bool, default=False)
       
   128         record.restricted = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:restricted ?o .}", bindings={'s':URIRef(record.uri)}, convert=self.convert_bool, default=False)
       
   129         record.editionStatement = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:editionStatement ?o .}", bindings={'s':URIRef(record.uri)})
       
   130         record.corporateAuthorLabel = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:corporateAuthorLabel ?o .}", bindings={'s':URIRef(record.uri)})
       
   131 
       
   132         language = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s dct:language ?o .}", bindings={'s':URIRef(record.uri)})
       
   133         if language:
       
   134             record.language, _ = Language.objects.get_or_create(uri=language)
       
   135 
       
   136         record.save()
       
   137 
       
   138         self.add_to_related_collection(record.otherLanguages, graph,  ['uri'], "SELECT ?o WHERE { ?s iiep:otherLanguage ?o .}", bindings={'s':URIRef(record.uri)})
       
   139         self.add_to_related_collection(record.subjects, graph, ['uri'], "SELECT ?o WHERE { ?s dct:subject ?o .}", bindings={'s':URIRef(record.uri)})
       
   140         self.add_to_related_collection(record.themes, graph, ['uri'], "SELECT ?o WHERE { ?s iiep:theme ?o .}", bindings={'s':URIRef(record.uri)})
       
   141         self.add_to_related_collection(record.countries, graph,  ['uri'], "SELECT ?o WHERE { ?s iiep:country ?o .}", bindings={'s':URIRef(record.uri)})
       
   142         self.add_to_related_collection(record.authors, graph, ['name'], "SELECT ?o WHERE { ?s iiep:author ?o .}", bindings={'s':URIRef(record.uri)})
       
   143         self.add_to_related_collection(record.subjectPersons, graph, ['name'], "SELECT ?o WHERE { ?s iiep:subjectPerson ?o .}", bindings={'s':URIRef(record.uri)})
       
   144         self.add_to_related_collection(record.projectNames, graph, ['uri'], "SELECT ?o WHERE { ?s iiep:projectName ?o . }")
       
   145         self.add_to_related_collection(record.audiences, graph,  ['uri'], "SELECT ?o WHERE { ?s dct:audience ?o .}", bindings={'s':URIRef(record.uri)})
       
   146 
       
   147         self.add_to_related_collection(
       
   148             record.periodicals,
       
   149             graph, 
       
   150             ['label','lang'],
       
   151             "SELECT DISTINCT ?o  ( lang(?o) as ?l) WHERE { ?s iiep:periodical ?o .}",
       
   152             bindings={'s':URIRef(record.uri)}
       
   153         )
       
   154 
       
   155         self.add_to_related_collection(
       
   156             record.meetings,
       
   157             graph, 
       
   158             ['label', 'meetingNumber', 'meetingPlace', 'meetingDate', 'meetingYear', 'lang'],
       
   159             "SELECT ?l ?mn ?mp ?md ?my (lang(COALESCE(?l,?nm, ?mp,?md,?my)) as ?lang) WHERE { [iiep:meeting ?bnode]. OPTIONAL { ?bnode rdfs:label ?l }. OPTIONAL { ?bnode iiep:meetingNumber ?mn }. OPTIONAL { ?bnode iiep:meetingPlace ?mp }.  OPTIONAL { ?bnode iiep:meetingDate ?md }. OPTIONAL { ?bnode iiep:meetingYear ?my }}",
       
   160             convert={'meetingYear' : lambda y: int(y) if y is not None else None}
       
   161         )
       
   162 
       
   163         self.add_to_related_collection(
       
   164             record.series,
       
   165             graph, 
       
   166             ['title', 'volume', 'lang'],
       
   167             "SELECT ?t ?vol (lang(COALESCE(?t,?vol)) as ?lang) WHERE { [iiep:serie ?bnode]. OPTIONAL { ?bnode dct:title ?t }. OPTIONAL { ?bnode iiep:volume ?vol } }",
       
   168         )
       
   169 
       
   170         self.add_to_related_collection(
       
   171             record.subjectCorporateBodies,
       
   172             graph,
       
   173             ['uri'],
       
   174             "SELECT ?o WHERE { ?s iiep:subjectCorporateBody ?o. }",
       
   175             bindings={'s':URIRef(record.uri)}
       
   176         )
       
   177 
       
   178         self.add_to_related_collection(
       
   179             record.subjectMeetings,
       
   180             graph,
       
   181             ['label', 'meetingNumber', 'meetingPlace', 'meetingDate', 'meetingYear'],
       
   182             "SELECT ?l ?mn ?mp ?md ?my WHERE { [iiep:subjectMeeting ?bnode]. OPTIONAL { ?bnode rdfs:label ?l }. OPTIONAL { ?bnode iiep:meetingNumber ?mn }. OPTIONAL { ?bnode iiep:meetingPlace ?mp }.  OPTIONAL { ?bnode iiep:meetingDate ?md }. OPTIONAL { ?bnode iiep:meetingYear ?my }}",            
       
   183             convert={'meetingYear' : lambda y: int(y) if y is not None else None}
       
   184         )
       
   185 
       
   186         self.add_to_related_collection(
       
   187             record.corporateAuthors,
       
   188             graph,
       
   189             ['uri'],
       
   190             "SELECT ?o WHERE { ?s iiep:corporateAuthor ?o.}",
       
   191             bindings={'s':URIRef(record.uri)}            
       
   192         )
       
   193 
       
   194         self.add_to_related_collection(
       
   195             record.issns,
       
   196             graph,
       
   197             ['issn', 'lang'],
       
   198             "SELECT ?issn (lang(COALESCE(?issn)) as ?lang) WHERE { ?s iiep:issn ?issn . }",
       
   199             bindings={'s':URIRef(record.uri)},
       
   200         )
       
   201 
       
   202         self.add_to_related_collection(
       
   203             record.isbns,
       
   204             graph,
       
   205             ['isbn', 'lang'],
       
   206             "SELECT ?isbn (lang(COALESCE(?isbn)) as ?lang) WHERE { ?s iiep:isbn ?isbn . }",
       
   207             bindings={'s':URIRef(record.uri)},
       
   208         )
       
   209 
       
   210         self.add_to_related_collection(
       
   211             record.documentCodes,
       
   212             graph,
       
   213             ['documentCode', 'lang'],
       
   214             "SELECT ?c (lang(COALESCE(?c)) as ?lang) WHERE { ?s iiep:documentCode ?c . }",
       
   215             bindings={'s':URIRef(record.uri)},
       
   216         )
       
   217 
       
   218         self.add_to_related_collection(
       
   219             record.titles,
       
   220             graph,
       
   221             ['title', 'lang'],
       
   222             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s dct:title ?t . }",
       
   223             bindings={'s':URIRef(record.uri)},
       
   224         )
       
   225 
       
   226         self.add_to_related_collection(
       
   227             record.abstracts,
       
   228             graph,
       
   229             ['abstract', 'lang'],
       
   230             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s dct:abstract ?t . }",
       
   231             bindings={'s':URIRef(record.uri)},
       
   232         )
       
   233 
       
   234         self.add_to_related_collection(
       
   235             record.addedTitles,
       
   236             graph,
       
   237             ['title', 'lang'],
       
   238             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s iiep:addedTitle ?t . }",
       
   239             bindings={'s':URIRef(record.uri)},
       
   240         )
       
   241 
       
   242         self.add_to_related_collection(
       
   243             record.titlesMainDocument,
       
   244             graph,
       
   245             ['title', 'lang'],
       
   246             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s iiep:titleMainDocument ?t . }",
       
   247             bindings={'s':URIRef(record.uri)},
       
   248         )
       
   249 
       
   250         self.add_to_related_collection(
       
   251             record.imprints,
       
   252             graph,
       
   253             ['imprintCity', 'publisher', 'imprintDate', 'lang'],
       
   254             "SELECT ?c ?p ?d (lang(COALESCE(?c, ?p, ?d)) as ?lang) WHERE { [ iiep:imprint ?bnode ]. OPTIONAL { ?bnode iiep:imprintCity ?c }. OPTIONAL { ?bnode dct:publisher ?p }. OPTIONAL { ?bnode iiep:imprintDate ?d }}",
       
   255         )
       
   256 
       
   257         self.add_to_related_collection(
       
   258             record.collations,
       
   259             graph,
       
   260             ['collation', 'lang'],
       
   261             "SELECT ?c (lang(COALESCE(?c)) as ?lang) WHERE { ?s iiep:collation ?c . }",
       
   262             bindings={'s':URIRef(record.uri)},
       
   263         )
       
   264 
       
   265         self.add_to_related_collection(
       
   266             record.volumeIssues,
       
   267             graph,
       
   268             ['volume', 'number', 'lang'],
       
   269             "SELECT ?v ?n (lang(COALESCE(?v, ?n)) as ?lang) WHERE { [ iiep:volumeIssue ?bnode ]. OPTIONAL { ?bnode iiep:volume ?v }. OPTIONAL { ?bnode iiep:number ?n }}",
       
   270         )
       
   271 
       
   272         self.add_to_related_collection(
       
   273             record.urls,
       
   274             graph,
       
   275             ['address', 'display'],
       
   276             "SELECT ?a ?d WHERE { [ iiep:url ?bnode ]. OPTIONAL { ?bnode iiep:address ?a }. OPTIONAL { ?bnode iiep:display ?d }.}",
       
   277         )
       
   278 
       
   279         return record