src/p4l/management/commands/import_record.py
changeset 107 48440ff95906
parent 106 71684a2ea502
child 108 c08f9b46a6c5
equal deleted inserted replaced
106:71684a2ea502 107:48440ff95906
     3 import logging
     3 import logging
     4 from optparse import make_option
     4 from optparse import make_option
     5 
     5 
     6 from django.core.management import BaseCommand
     6 from django.core.management import BaseCommand
     7 from django.db import reset_queries, transaction
     7 from django.db import reset_queries, transaction
     8 from rdflib import BNode, URIRef
     8 from rdflib import BNode
     9 from rdflib.plugins.sparql import prepareQuery
       
    10 
     9 
    11 from p4l.management.constants import get_empty_graph, IIEP
    10 from p4l.mapping.constants import get_empty_graph, IIEP
    12 from p4l.models import Record, Language
    11 from p4l.mapping.parsers import RecordParser, QueryCache
    13 from p4l.utils import show_progress
    12 from p4l.utils import show_progress
    14 import xml.etree.cElementTree as ET
    13 import xml.etree.cElementTree as ET
    15 
    14 
    16 
    15 
    17 logger = logging.getLogger(__name__)
    16 logger = logging.getLogger(__name__)
    46         ),
    45         ),
    47     )
    46     )
    48 
    47 
    def __init__(self, *args, **kwargs):
        """Initialise the command and the per-instance import helpers.

        All positional and keyword arguments are forwarded unchanged to the
        parent command class.
        """
        super(Command, self).__init__(*args, **kwargs)
        # Prepared SPARQL queries keyed by query text; filled lazily by
        # __get_sparql_query.
        self.__query_cache = {}
        # Parser whose build_record() is called for each record graph by the
        # import loop.
        self.record_parser = RecordParser(query_cache=QueryCache())
    52 
       
    53 
       
    def __get_sparql_query(self, query, namespaces):
        """Return the prepared form of ``query``, preparing it on first use.

        The prepared query is stored in the instance cache under the query
        text.  The cache key is the text only, so ``namespaces`` is used
        solely the first time a given text is seen.
        """
        cache = self.__query_cache
        if query not in cache:
            cache[query] = prepareQuery(query, initNs=namespaces)
        return cache[query]
       
    59 
       
    60 
       
    def extract_single_value_form_graph(self, graph, q, bindings={}, index=0, convert=lambda v:unicode(v) if v is not None else None):
        """Return the first result of ``extract_multiple_values_from_graph``.

        Takes the same arguments as ``extract_multiple_values_from_graph``
        and returns ``None`` when the query yields nothing.
        """
        matches = self.extract_multiple_values_from_graph(graph, q, bindings, index, convert)
        # Only the first item is consumed; the rest of the generator is dropped.
        for first in matches:
            return first
        return None
       
    63 
       
    def extract_multiple_values_from_graph(self, graph, q, bindings={}, index=0, convert=lambda v:unicode(v) if v is not None else None):
        """Run SPARQL query ``q`` on ``graph`` and yield converted results.

        ``index`` selects the shape of what is yielded:

        * an ``int``: the row's columns ``0..index`` are converted and the
          value of column ``index`` is yielded;
        * any other sequence of keys: the keys are paired positionally with
          the row's columns and a ``{key: converted value}`` dict is yielded.

        ``convert`` is either one callable applied to every column, or a
        dict mapping keys to callables.  Keys without a callable fall back
        to "``unicode(v)``, or ``None`` when ``v`` is ``None``".

        Iteration stops at the first row that has fewer columns than keys.
        """
        def to_text(v):
            return unicode(v) if v is not None else None

        single_column = isinstance(index, int)
        keys = range(index + 1) if single_column else index

        if hasattr(convert, '__call__'):
            converters = dict((key, convert) for key in keys)
        else:
            converters = convert
        # Replace any non-callable entry by the default text converter.
        converters = dict(
            (key, func if hasattr(func, '__call__') else to_text)
            for key, func in converters.iteritems()
        )

        prepared = self.__get_sparql_query(q, dict(graph.namespaces()))
        for row in graph.query(prepared, initBindings=bindings):
            if len(row) < len(keys):
                break
            converted = dict(
                (key, converters.get(key, to_text)(value))
                for key, value in zip(keys, row)
            )
            yield converted[index] if single_column else converted
       
    86 
       
    87 
       
    def convert_bool(self, val):
        """Coerce a loosely-typed value to a real ``bool``.

        * ``bool`` values are returned as they are;
        * ``None`` and the empty string give ``False``;
        * a non-empty string is ``True`` when its first character,
          lower-cased, is one of ``t``, ``y``, ``1`` or ``o``;
        * anything else goes through ``bool()``.

        Always returns ``True`` or ``False`` (numbers such as ``1`` or ``0``
        are no longer handed back unchanged).
        """
        if isinstance(val, bool):
            return val
        if val is None:
            return False
        try:
            text_types = basestring  # Python 2: covers str and unicode
        except NameError:
            text_types = str  # interpreters without basestring
        if isinstance(val, text_types):
            # NOTE(review): 'o' presumably stands for French "oui"; it also
            # makes strings such as "off" truthy -- confirm this is intended.
            return val[:1].lower() in ('t', 'y', '1', 'o')
        return bool(val)
       
   101 
       
   102 
       
   103     def add_to_related_collection(self, coll, graph, fields, q, bindings={},  convert=lambda v: unicode(v) if v is not None else None, through_fields=None):
       
   104         
    51         
   105         for val in self.extract_multiple_values_from_graph(graph, q, bindings=bindings, index=fields, convert=convert):
       
   106 
       
   107             if through_fields:                
       
   108                 new_obj_val = dict([(k,v) for k,v in val.iteritems() if k not in through_fields])
       
   109             else:
       
   110                 new_obj_val = val
       
   111 
       
   112             if hasattr(coll, 'through'):
       
   113                 new_obj_rel, _ = coll.model.objects.get_or_create(**new_obj_val)
       
   114                 if through_fields:
       
   115                     through_vals = {coll.source_field_name: coll.instance, coll.target_field_name: new_obj_rel}
       
   116                     through_vals.update(dict([(k,v) for k,v in val.iteritems() if k in through_fields]))
       
   117                     coll.through.objects.create(**through_vals)
       
   118                     new_obj = None
       
   119                 else:
       
   120                     new_obj = new_obj_rel
       
   121 
       
   122             else:
       
   123                 new_obj = coll.create(**new_obj_val)
       
   124             
       
   125             if new_obj:
       
   126                 coll.add(new_obj)
       
   127 
       
   128 
       
   129 
       
   130 
       
   131     def build_record(self, graph, delete=True):
       
   132 
       
   133         record_uri = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?s WHERE { ?s rdf:type iiep:Record .}")
       
   134         record_identifier = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s dct:identifier ?o .}", bindings={'s':URIRef(record_uri)})
       
   135         
       
   136         if delete:
       
   137             Record.objects.filter(identifier=record_identifier).delete()
       
   138 
       
   139         record = Record()
       
   140         record.uri = record_uri
       
   141         record.identifier = record_identifier
       
   142         record.notes = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:notes ?o .}", bindings={'s':URIRef(record.uri)})
       
   143         record.recordType = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s dct:type ?o .}", bindings={'s':URIRef(record.uri)})
       
   144         record.isDocumentPart = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:isDocumentPart ?o .}", bindings={'s':URIRef(record.uri)}, convert=self.convert_bool)
       
   145         record.hidden = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:hidden ?o .}", bindings={'s':URIRef(record.uri)}, convert=self.convert_bool)
       
   146         record.restricted = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:restricted ?o .}", bindings={'s':URIRef(record.uri)}, convert=self.convert_bool)
       
   147         record.editionStatement = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:editionStatement ?o .}", bindings={'s':URIRef(record.uri)})
       
   148         record.corporateAuthorLabel = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s iiep:corporateAuthorLabel ?o .}", bindings={'s':URIRef(record.uri)})
       
   149 
       
   150         language = self.extract_single_value_form_graph(graph,"SELECT DISTINCT ?o WHERE { ?s dct:language ?o .}", bindings={'s':URIRef(record.uri)})
       
   151         if language:
       
   152             record.language, _ = Language.objects.get_or_create(uri=language)
       
   153 
       
   154         record.save()
       
   155 
       
   156         self.add_to_related_collection(record.otherLanguages, graph,  ['uri'], "SELECT ?o WHERE { ?s iiep:otherLanguage ?o .}", bindings={'s':URIRef(record.uri)})
       
   157         self.add_to_related_collection(record.subjects, graph, ['uri'], "SELECT ?o WHERE { ?s dct:subject ?o .}", bindings={'s':URIRef(record.uri)})
       
   158         self.add_to_related_collection(record.themes, graph, ['uri'], "SELECT ?o WHERE { ?s iiep:theme ?o .}", bindings={'s':URIRef(record.uri)})
       
   159         self.add_to_related_collection(record.countries, graph,  ['uri'], "SELECT ?o WHERE { ?s iiep:country ?o .}", bindings={'s':URIRef(record.uri)})
       
   160         self.add_to_related_collection(record.authors, graph, ['name'], "SELECT ?o WHERE { ?s iiep:author ?o .}", bindings={'s':URIRef(record.uri)})
       
   161         self.add_to_related_collection(record.subjectPersons, graph, ['name'], "SELECT ?o WHERE { ?s iiep:subjectPerson ?o .}", bindings={'s':URIRef(record.uri)})
       
   162         self.add_to_related_collection(record.projectNames, graph, ['uri'], "SELECT ?o WHERE { ?s iiep:projectName ?o . }")
       
   163         self.add_to_related_collection(record.audiences, graph,  ['uri'], "SELECT ?o WHERE { ?s dct:audience ?o .}", bindings={'s':URIRef(record.uri)})
       
   164 
       
   165         self.add_to_related_collection(
       
   166             record.periodicals,
       
   167             graph, 
       
   168             ['label','lang'],
       
   169             "SELECT DISTINCT ?o  ( lang(?o) as ?l) WHERE { ?s iiep:periodical ?o .}",
       
   170             bindings={'s':URIRef(record.uri)}
       
   171         )
       
   172 
       
   173         self.add_to_related_collection(
       
   174             record.meetings,
       
   175             graph, 
       
   176             ['label', 'meetingNumber', 'meetingPlace', 'meetingDate', 'meetingYear', 'lang'],
       
   177             "SELECT ?l ?mn ?mp ?md ?my (lang(COALESCE(?l,?nm, ?mp,?md,?my)) as ?lang) WHERE { [iiep:meeting ?bnode]. OPTIONAL { ?bnode rdfs:label ?l }. OPTIONAL { ?bnode iiep:meetingNumber ?mn }. OPTIONAL { ?bnode iiep:meetingPlace ?mp }.  OPTIONAL { ?bnode iiep:meetingDate ?md }. OPTIONAL { ?bnode iiep:meetingYear ?my }}",
       
   178             convert={'meetingYear' : lambda y: int(y) if y is not None else None}
       
   179         )
       
   180 
       
   181         self.add_to_related_collection(
       
   182             record.series,
       
   183             graph, 
       
   184             ['title', 'volume', 'lang'],
       
   185             "SELECT ?t ?vol (lang(COALESCE(?t,?vol)) as ?lang) WHERE { [iiep:serie ?bnode]. OPTIONAL { ?bnode dct:title ?t }. OPTIONAL { ?bnode iiep:volume ?vol } }",
       
   186         )
       
   187 
       
   188         self.add_to_related_collection(
       
   189             record.subjectCorporateBodies,
       
   190             graph,
       
   191             ['uri'],
       
   192             "SELECT ?o WHERE { ?s iiep:subjectCorporateBody ?o. }",
       
   193             bindings={'s':URIRef(record.uri)}
       
   194         )
       
   195 
       
   196         self.add_to_related_collection(
       
   197             record.subjectMeetings,
       
   198             graph,
       
   199             ['label', 'meetingNumber', 'meetingPlace', 'meetingDate', 'meetingYear'],
       
   200             "SELECT ?l ?mn ?mp ?md ?my WHERE { [iiep:subjectMeeting ?bnode]. OPTIONAL { ?bnode rdfs:label ?l }. OPTIONAL { ?bnode iiep:meetingNumber ?mn }. OPTIONAL { ?bnode iiep:meetingPlace ?mp }.  OPTIONAL { ?bnode iiep:meetingDate ?md }. OPTIONAL { ?bnode iiep:meetingYear ?my }}",            
       
   201             convert={'meetingYear' : lambda y: int(y) if y is not None else None}
       
   202         )
       
   203 
       
   204         self.add_to_related_collection(
       
   205             record.corporateAuthors,
       
   206             graph,
       
   207             ['uri'],
       
   208             "SELECT ?o WHERE { ?s iiep:corporateAuthor ?o.}",
       
   209             bindings={'s':URIRef(record.uri)}            
       
   210         )
       
   211 
       
   212         self.add_to_related_collection(
       
   213             record.issns,
       
   214             graph,
       
   215             ['issn', 'lang'],
       
   216             "SELECT ?issn (lang(COALESCE(?issn)) as ?lang) WHERE { ?s iiep:issn ?issn . }",
       
   217             bindings={'s':URIRef(record.uri)},
       
   218         )
       
   219 
       
   220         self.add_to_related_collection(
       
   221             record.isbns,
       
   222             graph,
       
   223             ['isbn', 'lang'],
       
   224             "SELECT ?isbn (lang(COALESCE(?isbn)) as ?lang) WHERE { ?s iiep:isbn ?isbn . }",
       
   225             bindings={'s':URIRef(record.uri)},
       
   226         )
       
   227 
       
   228         self.add_to_related_collection(
       
   229             record.documentCodes,
       
   230             graph,
       
   231             ['documentCode', 'lang'],
       
   232             "SELECT ?c (lang(COALESCE(?c)) as ?lang) WHERE { ?s iiep:documentCode ?c . }",
       
   233             bindings={'s':URIRef(record.uri)},
       
   234         )
       
   235 
       
   236         self.add_to_related_collection(
       
   237             record.titles,
       
   238             graph,
       
   239             ['title', 'lang'],
       
   240             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s dct:title ?t . }",
       
   241             bindings={'s':URIRef(record.uri)},
       
   242         )
       
   243 
       
   244         self.add_to_related_collection(
       
   245             record.abstracts,
       
   246             graph,
       
   247             ['abstract', 'lang'],
       
   248             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s dct:abstract ?t . }",
       
   249             bindings={'s':URIRef(record.uri)},
       
   250         )
       
   251 
       
   252         self.add_to_related_collection(
       
   253             record.addedTitles,
       
   254             graph,
       
   255             ['title', 'lang'],
       
   256             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s iiep:addedTitle ?t . }",
       
   257             bindings={'s':URIRef(record.uri)},
       
   258         )
       
   259 
       
   260         self.add_to_related_collection(
       
   261             record.titlesMainDocument,
       
   262             graph,
       
   263             ['title', 'lang'],
       
   264             "SELECT ?t (lang(COALESCE(?t)) as ?lang) WHERE { ?s iiep:titleMainDocument ?t . }",
       
   265             bindings={'s':URIRef(record.uri)},
       
   266         )
       
   267 
       
   268         self.add_to_related_collection(
       
   269             record.imprints,
       
   270             graph,
       
   271             ['imprintCity', 'publisher', 'imprintDate', 'lang'],
       
   272             "SELECT ?c ?p ?d (lang(COALESCE(?c, ?p, ?d)) as ?lang) WHERE { [ iiep:imprint ?bnode ]. OPTIONAL { ?bnode iiep:imprintCity ?c }. OPTIONAL { ?bnode dct:publisher ?p }. OPTIONAL { ?bnode iiep:imprintDate ?d }}",
       
   273         )
       
   274 
       
   275         self.add_to_related_collection(
       
   276             record.collations,
       
   277             graph,
       
   278             ['collation', 'lang'],
       
   279             "SELECT ?c (lang(COALESCE(?c)) as ?lang) WHERE { ?s iiep:collation ?c . }",
       
   280             bindings={'s':URIRef(record.uri)},
       
   281         )
       
   282 
       
   283         self.add_to_related_collection(
       
   284             record.volumeIssues,
       
   285             graph,
       
   286             ['volume', 'number', 'lang'],
       
   287             "SELECT ?v ?n (lang(COALESCE(?v, ?n)) as ?lang) WHERE { [ iiep:volumeIssue ?bnode ]. OPTIONAL { ?bnode iiep:volume ?v }. OPTIONAL { ?bnode iiep:number ?n }}",
       
   288         )
       
   289 
       
   290         self.add_to_related_collection(
       
   291             record.urls,
       
   292             graph,
       
   293             ['address', 'display'],
       
   294             "SELECT ?a ?d WHERE { [ iiep:url ?bnode ]. OPTIONAL { ?bnode iiep:address ?a }. OPTIONAL { ?bnode iiep:display ?d }.}",
       
   295         )
       
   296 
       
   297         return record
       
   298 
       
   299 
    52 
   300     def filter_node(self, node, graph, res_graph):
    53     def filter_node(self, node, graph, res_graph):
   301         for p,o in graph[node]:
    54         for p,o in graph[node]:
   302             res_graph.add((node,p,o))
    55             res_graph.add((node,p,o))
   303             if isinstance(o, BNode):
    56             if isinstance(o, BNode):
   326                 i += 1
    79                 i += 1
   327                 writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
    80                 writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
   328                 try:
    81                 try:
   329                     record_graph = get_empty_graph()
    82                     record_graph = get_empty_graph()
   330                     record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')                    
    83                     record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')                    
   331                     self.build_record(record_graph, delete=(not self.preserve))                    
    84                     self.record_parser.build_record(record_graph, delete=(not self.preserve))                    
   332                 except Exception as e:
    85                 except Exception as e:
   333                     transaction.rollback()
    86                     transaction.rollback()
   334                     msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
    87                     msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
   335                     logger.exception(msg)
    88                     logger.exception(msg)
   336                     errors.append((i, records_url, msg))
    89                     errors.append((i, records_url, msg))