src/p4l/management/commands/import_record.py
author ymh <ymh.work@gmail.com>
Fri, 20 Sep 2013 10:55:05 +0200
changeset 108 c08f9b46a6c5
parent 107 48440ff95906
child 114 93b45b4f423c
permissions -rw-r--r--
use PEP8 convention on system fields for Records
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     3
import logging
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     4
from optparse import make_option
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     5
6
ff4d2d4f1fb0 correct import of VolumeIssue
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
     6
from django.core.management import BaseCommand
ff4d2d4f1fb0 correct import of VolumeIssue
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
     7
from django.db import reset_queries, transaction
107
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 106
diff changeset
     8
from rdflib import BNode
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     9
107
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 106
diff changeset
    10
from p4l.mapping.constants import get_empty_graph, IIEP
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 106
diff changeset
    11
from p4l.mapping.parsers import RecordParser, QueryCache
13
6296aa12fd71 model simplification, correct import on language. We do not try to impose a language when none is found. add forgotten abstract field on import.
ymh <ymh.work@gmail.com>
parents: 7
diff changeset
    12
from p4l.utils import show_progress
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
import xml.etree.cElementTree as ET
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
    15
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
# Module-level logger named after this module; process_url uses it to
# record the traceback of every record that fails to import.
logger = logging.getLogger(__name__)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
# URI for English ("#eng") under the OASIS ISO 639 published-subject namespace.
# NOTE(review): not referenced anywhere in the visible part of this file --
# confirm it is still used elsewhere before relying on it.
DEFAULT_LANGUAGE_URI = "http://psi.oasis-open.org/iso/639/#eng"
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    21
# SPARQL SELECT that binds ?main_lang to the subject's dct:language value when
# one is bound, and otherwise to its iiep:otherLanguage value (COALESCE keeps
# the first bound argument). The dct: and iiep: prefixes are not declared in
# the query text itself, so the caller presumably supplies the namespace
# bindings -- TODO confirm.
# NOTE(review): not referenced anywhere in the visible part of this file.
DEFAULT_LANGUAGE_QUERY =  """SELECT ( COALESCE(?lang, ?other_lang) as ?main_lang) WHERE {
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    22
    OPTIONAL { ?s dct:language ?lang }.
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    23
    OPTIONAL { ?s iiep:otherLanguage ?other_lang }.
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    24
}"""
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    25
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    26
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    27
class Command(BaseCommand):
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    28
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
    # Positional arguments: one or more record sources; handle() passes each
    # of them to process_url() in turn.
    args = "record_url ..."
    help = "Import p4l record rdf format"

    # Command-line switches added on top of the ones BaseCommand provides.
    option_list = BaseCommand.option_list + (
        # -b / --batch-size: integer read by handle() into self.batch_size;
        # process_url() calls reset_queries() every batch_size records.
        make_option(
            '-b', '--batch-size',
            dest='batch_size',
            type='int',
            default=50,
            help='number of object to import in bulk operations',
        ),
        # -p / --preserve: boolean flag read by handle() into self.preserve;
        # when it is NOT set, process_url() calls build_record(delete=True).
        make_option(
            '-p', '--preserve',
            dest='preserve',
            action='store_true',
            default=False,
            help='preserve existing record',
        ),
    )
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    47
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    48
    def __init__(self, *args, **kwargs):
        """Initialise the base command and create the record parser.

        All positional and keyword arguments are forwarded unchanged to
        ``BaseCommand.__init__``.
        """
        super(Command, self).__init__(*args, **kwargs)
        # A single parser, backed by a single QueryCache, is created here and
        # reused by process_url() for every record.
        query_cache = QueryCache()
        self.record_parser = RecordParser(query_cache=query_cache)
106
71684a2ea502 delete record by default when importing
ymh <ymh.work@gmail.com>
parents: 105
diff changeset
    51
        
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    52
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    53
    def filter_node(self, node, graph, res_graph):
        """Copy the sub-graph rooted at ``node`` from ``graph`` into ``res_graph``.

        Every ``(node, predicate, object)`` triple of ``node`` is added to
        ``res_graph``. Whenever an object is a ``BNode`` its own triples are
        copied as well, transitively.

        Each node is expanded at most once, so blank nodes that reference
        each other in a cycle do not cause unbounded recursion, and the walk
        is iterative so long blank-node chains cannot exhaust the interpreter
        recursion limit.

        :param node: subject to start from (must be hashable).
        :param graph: source; ``graph[subject]`` must yield
            ``(predicate, object)`` pairs.
        :param res_graph: destination; must provide ``add((s, p, o))``.
        :returns: ``None`` -- ``res_graph`` is filled in place.
        """
        visited = set()
        pending = [node]
        while pending:
            subject = pending.pop()
            if subject in visited:
                # Already expanded through another path (or a cycle).
                continue
            visited.add(subject)
            for predicate, obj in graph[subject]:
                res_graph.add((subject, predicate, obj))
                # Only blank nodes are followed; any other object is copied
                # as a plain value of the triple.
                if isinstance(obj, BNode) and obj not in visited:
                    pending.append(obj)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    58
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    59
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    60
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    61
    def calculate_records_nb(self, records_url):
        """Count the ``Record`` elements (``IIEP`` namespace) in a document.

        The document is streamed with ``ET.iterparse`` and every element is
        cleared as soon as its end tag has been seen, so memory use does not
        grow with the content of the records.

        :param records_url: source handed as-is to ``ET.iterparse``.
            NOTE(review): despite the name, ``iterparse`` takes a file name
            or file object, not a remote URL -- confirm what callers pass.
        :returns: the number of ``Record`` elements found (``int``).
        """
        record_tag = "{%s}Record" % IIEP
        count = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag == record_tag:
                count += 1
            # Only the tag matters here: drop children, attributes and text
            # so the parsed tree is not kept in memory.
            elem.clear()
        return count
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    68
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    69
    def process_url(self, records_url, options):
        """Import every ``Record`` element (``IIEP`` namespace) of a document.

        The document is read twice: once by ``calculate_records_nb`` to get
        the total used for the progress display, then once more to import
        the records one at a time. Each record is serialised back to XML,
        parsed into a fresh graph from ``get_empty_graph()`` and handed to
        ``self.record_parser.build_record``. The transaction is committed
        after each successful record and rolled back after each failing one;
        a failure is logged and collected, and the import continues.

        Relies on ``self.preserve`` and ``self.batch_size``, which
        ``handle()`` sets before calling this method.

        :param records_url: source handed as-is to ``ET.iterparse``.
        :param options: command options; not used by this method.
        :returns: list of ``(record_index, records_url, message)`` tuples,
            one per record that failed (empty when everything succeeded).
        """
        total_records = self.calculate_records_nb(records_url)
        record_tag = "{%s}Record" % IIEP
        writer = None
        errors = []

        i = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag != record_tag:
                # Children of a record must stay intact until the enclosing
                # Record's own end event, so nothing is cleared here.
                continue

            i += 1
            writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
            try:
                record_graph = get_empty_graph()
                record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')
                self.record_parser.build_record(record_graph, delete=(not self.preserve))
            except Exception as e:
                # Best effort: undo this record only, remember the failure
                # and carry on with the next record.
                transaction.rollback()
                msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
                logger.exception(msg)
                errors.append((i, records_url, msg))
            else:
                transaction.commit()
            finally:
                # The record has been serialised and handled: release its
                # subtree so memory does not grow with the number of records.
                elem.clear()

            if i % self.batch_size == 0:
                reset_queries()

        return errors
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    97
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    98
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    99
    # def process_url(self, records_url, options):
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   100
    #     #open graph with rdflib
13
6296aa12fd71 model simplification, correct import on language. We do not try to impose a language when none is found. add forgotten abstract field on import.
ymh <ymh.work@gmail.com>
parents: 7
diff changeset
   101
    #             
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   102
    #     g = Graph()
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   103
    #     print("Loading %s" % records_url)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   104
    #     g.parse(records_url)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   105
    #     print("Parsing %s done" % records_url)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   106
    #     for i,record_uri in enumerate(g[:RDF.type:IIEP.Record]):
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   107
    #         print(i, repr(record_uri))
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   108
    #         record_graph = self.get_empty_graph()
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   109
    #         self.filter_node(record_uri, g, record_graph)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   110
    #         self.build_record(record_graph)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   111
    #         if i > 3:
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   112
    #             break
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   113
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   114
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   115
    def handle(self, *args, **options):
        """Entry point of the command: import every source given in ``args``.

        Stores the ``batch_size`` and ``preserve`` options on the instance
        (``process_url`` reads them), switches to manual transaction
        management, then processes each source in turn and prints a summary
        line when a source produced errors.

        ``leave_transaction_management()`` is called in a ``finally`` clause
        so the transaction-management state entered here is released even
        when a source cannot be read or parsed.

        :param args: record sources, each passed to ``process_url``.
        :param options: parsed command-line options (``batch_size``,
            ``preserve``).
        """
        self.batch_size = options.get('batch_size', 50)
        self.preserve = options.get("preserve", False)

        transaction.enter_transaction_management()
        try:
            transaction.managed(True)

            for records_url in args:
                print("Processing %s" % records_url)
                errors = self.process_url(records_url, options)
                print("Processing %s Done" % records_url)
                if errors:
                    print("%d error(s) when processing %s, check your log file." % (len(errors), records_url))
        finally:
            transaction.leave_transaction_management()
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   130