src/p4l/management/commands/import_record.py
author ymh <ymh.work@gmail.com>
Thu, 26 Sep 2013 15:24:41 +0200
changeset 119 ece69ca3ac24
parent 114 93b45b4f423c
child 126 a345f1a67bf1
permissions -rw-r--r--
- correct import pour indexation - improve new record management
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     3
import logging
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     4
from optparse import make_option
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     5
6
ff4d2d4f1fb0 correct import of VolumeIssue
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
     6
from django.core.management import BaseCommand
ff4d2d4f1fb0 correct import of VolumeIssue
ymh <ymh.work@gmail.com>
parents: 0
diff changeset
     7
from django.db import reset_queries, transaction
107
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 106
diff changeset
     8
from rdflib import BNode
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
     9
107
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 106
diff changeset
    10
from p4l.mapping.constants import get_empty_graph, IIEP
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 106
diff changeset
    11
from p4l.mapping.parsers import RecordParser, QueryCache
13
6296aa12fd71 model simplification, correct import on language. We do not try to impose a language when none is found. add forgotten abstract field on import.
ymh <ymh.work@gmail.com>
parents: 7
diff changeset
    12
from p4l.utils import show_progress
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
import xml.etree.cElementTree as ET
114
93b45b4f423c add corporate authors and small adjustments
ymh <ymh.work@gmail.com>
parents: 108
diff changeset
    14
from django.conf import settings
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents: 22
diff changeset
    16
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default language URI; the fragment is the ISO 639 code "eng".
DEFAULT_LANGUAGE_URI = "http://psi.oasis-open.org/iso/639/#eng"

# Query binding ?main_lang to the dct:language value when one is bound,
# otherwise to the iiep:otherLanguage value (COALESCE of the two OPTIONALs).
DEFAULT_LANGUAGE_QUERY = (
    """SELECT ( COALESCE(?lang, ?other_lang) as ?main_lang) WHERE {
    OPTIONAL { ?s dct:language ?lang }.
    OPTIONAL { ?s iiep:otherLanguage ?other_lang }.
}"""
)
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    26
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    27
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    28
class Command(BaseCommand):
    """Management command importing P4L records from RDF/XML sources.

    Every positional argument is a source handed to ``ET.iterparse``. Each
    ``Record`` element (in the ``IIEP`` namespace) found in a source is
    parsed into its own graph and passed to ``RecordParser.build_record``;
    the transaction is committed after each successful record and rolled
    back after each failing one.
    """

    args = "record_url ..."

    help = "Import p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-b', '--batch-size',
            dest='batch_size',
            type='int',
            default=50,
            help='number of object to import in bulk operations'
        ),
        make_option('-p', '--preserve',
            dest='preserve',
            action='store_true',
            default=False,
            help='preserve existing record'
        ),
        make_option('-i', '--index',
            dest='index',
            action='store_true',
            default=False,
            help='index while importing'
        ),
    )

    def __init__(self, *args, **kwargs):
        """Create the command and the record parser shared by all imports."""
        super(Command, self).__init__(*args, **kwargs)
        self.record_parser = RecordParser(query_cache=QueryCache())
        # Same defaults as the command-line options, so that process_url()
        # is usable even when handle() has not run; handle() overrides them.
        self.batch_size = 50
        self.preserve = False
        self.index = False

    def filter_node(self, node, graph, res_graph):
        """Copy into ``res_graph`` the triples reachable from ``node``.

        All ``(node, predicate, object)`` triples of ``graph`` are added to
        ``res_graph``; the same is then done for every object that is a
        blank node (``BNode``), transitively.

        The walk is iterative and remembers visited subjects, so blank nodes
        that reference each other in a cycle cannot cause unbounded
        recursion. Nothing else in this class calls this method.

        :param node: subject the copy starts from.
        :param graph: source graph; ``graph[subject]`` must iterate over
            ``(predicate, object)`` pairs.
        :param res_graph: destination graph, modified in place.
        """
        pending = [node]
        visited = set()
        while pending:
            subject = pending.pop()
            if subject in visited:
                continue
            visited.add(subject)
            for p, o in graph[subject]:
                res_graph.add((subject, p, o))
                if isinstance(o, BNode):
                    pending.append(o)

    def calculate_records_nb(self, records_url):
        """Return the number of ``Record`` elements found in ``records_url``.

        The source is streamed with ``ET.iterparse``. Each element is cleared
        once its end event has been seen: only the tag is needed for
        counting, and clearing keeps memory use low on large files.

        :param records_url: source passed as-is to ``ET.iterparse``.
        :return: count of elements whose tag is ``{IIEP}Record``.
        """
        record_tag = "{%s}Record" % IIEP
        count = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag == record_tag:
                count += 1
            # clear() drops children, attributes and text but keeps the tag.
            elem.clear()
        return count

    def process_url(self, records_url, options):
        """Import every record of ``records_url`` and collect the failures.

        The source is read twice: once to count the records (for the progress
        display), once to import them. A failure on one record is logged and
        rolled back, and the import goes on with the next record.

        :param records_url: source passed as-is to ``ET.iterparse``.
        :param options: command options; unused here, the settings are read
            from ``self.batch_size`` and ``self.preserve``.
        :return: list of ``(record index, records_url, message)`` tuples, one
            per record that could not be imported.
        """
        total_records = self.calculate_records_nb(records_url)
        writer = None
        errors = []
        record_tag = "{%s}Record" % IIEP

        # NOTE(review): processed elements are not cleared here, so the parsed
        # tree stays in memory for the whole import. Clearing each Record
        # after use would fix that, provided records are never nested inside
        # one another - to be confirmed against the data.
        i = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag != record_tag:
                continue

            i += 1
            writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
            try:
                record_graph = get_empty_graph()
                record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')
                self.record_parser.build_record(record_graph, delete=(not self.preserve))
            except Exception as e:
                # Per-record boundary: undo this record only, keep importing.
                transaction.rollback()
                msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
                logger.exception(msg)
                errors.append((i, records_url, msg))
            else:
                transaction.commit()

            # A batch size of 0 (or None) means "never reset" rather than a
            # ZeroDivisionError/TypeError on the modulo.
            if self.batch_size and i % self.batch_size == 0:
                reset_queries()

        return errors

    def handle(self, *args, **options):
        """Run the import for every source given on the command line.

        Unless ``--index`` is given, ``settings.REALTIME_INDEXING`` is set to
        ``False`` for the duration of the import. Transaction management is
        left and the setting is restored even when the import raises.

        :param args: sources to import, each passed to ``process_url``.
        :param options: parsed options (``batch_size``, ``preserve``,
            ``index``).
        """
        self.batch_size = options.get('batch_size', 50)
        self.preserve = options.get("preserve", False)
        self.index = options.get("index", False)

        suspend_indexing = not self.index
        old_realtime_indexing = None
        if suspend_indexing:
            old_realtime_indexing = getattr(settings, "REALTIME_INDEXING", None)
            #this is not recommended by the django manual, but in case of management command it seems to work
            settings.REALTIME_INDEXING = False

        try:
            transaction.enter_transaction_management()
            try:
                transaction.managed(True)

                for records_url in args:
                    print("Processing %s" % records_url)
                    errors = self.process_url(records_url, options)
                    print("Processing %s Done" % records_url)
                    if errors:
                        print("%d error(s) when processing %s, check your log file." % (len(errors), records_url))
            finally:
                transaction.leave_transaction_management()
        finally:
            # Put the previous value back only when it was truthy; otherwise
            # the setting is left at False, as before.
            if suspend_indexing and old_realtime_indexing:
                settings.REALTIME_INDEXING = old_realtime_indexing
0
81e7900b06a7 First import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   147