| author | ymh <ymh.work@gmail.com> |
| Thu, 26 Sep 2013 15:24:41 +0200 | |
| changeset 119 | ece69ca3ac24 |
| parent 114 | 93b45b4f423c |
| child 126 | a345f1a67bf1 |
| permissions | -rw-r--r-- |
| 0 | 1 |
# -*- coding: utf-8 -*- |
2 |
||
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
3 |
import logging |
|
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
4 |
from optparse import make_option |
|
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
5 |
|
| 6 | 6 |
from django.core.management import BaseCommand |
7 |
from django.db import reset_queries, transaction |
|
| 107 | 8 |
from rdflib import BNode |
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
9 |
|
| 107 | 10 |
from p4l.mapping.constants import get_empty_graph, IIEP |
11 |
from p4l.mapping.parsers import RecordParser, QueryCache |
|
|
13
6296aa12fd71
model simplification, correct import on language. We do not try to impose a language when none is found. add forgotten abstract field on import.
ymh <ymh.work@gmail.com>
parents:
7
diff
changeset
|
12 |
from p4l.utils import show_progress |
| 0 | 13 |
import xml.etree.cElementTree as ET |
|
114
93b45b4f423c
add corporate authors and small adjustments
ymh <ymh.work@gmail.com>
parents:
108
diff
changeset
|
14 |
from django.conf import settings |
| 0 | 15 |
|
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
16 |
|
| 0 | 17 |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# URI identifying English in the OASIS ISO 639 published subject set.
DEFAULT_LANGUAGE_URI = "http://psi.oasis-open.org/iso/639/#eng"

# SPARQL query yielding ``?main_lang``: the dct:language value when there is
# one, otherwise the iiep:otherLanguage value. Whitespace inside the query is
# not significant to SPARQL.
DEFAULT_LANGUAGE_QUERY = """SELECT ( COALESCE(?lang, ?other_lang) as ?main_lang) WHERE {
    OPTIONAL { ?s dct:language ?lang }.
    OPTIONAL { ?s iiep:otherLanguage ?other_lang }.
}"""
|
26 |
||
27 |
||
28 |
class Command(BaseCommand):
    """Management command importing p4l records from RDF/XML sources.

    Every positional argument is handed to ``ET.iterparse``; each
    ``{IIEP}Record`` element found is parsed into its own graph and passed to
    ``RecordParser.build_record``. One database transaction is committed (or
    rolled back) per record.
    """

    args = "record_url ..."

    help = "Import p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-b', '--batch-size',
            dest='batch_size',
            type='int',
            default=50,
            help='number of object to import in bulk operations'
        ),
        make_option('-p', '--preserve',
            dest='preserve',
            action='store_true',
            default=False,
            help='preserve existing record'
        ),
        make_option('-i', '--index',
            dest='index',
            action='store_true',
            default=False,
            help='index while importing'
        ),
    )

    def __init__(self, *args, **kwargs):
        """Create the command and the record parser shared by all records."""
        super(Command, self).__init__(*args, **kwargs)
        self.record_parser = RecordParser(query_cache=QueryCache())

    def filter_node(self, node, graph, res_graph):
        """Copy into ``res_graph`` every triple reachable from ``node``.

        All ``(node, p, o)`` triples of ``graph`` are added to ``res_graph``;
        whenever an object is a blank node, its own triples are copied too.

        The walk is iterative and remembers visited nodes, so a cycle of
        blank nodes terminates and deep chains cannot exhaust the recursion
        limit.
        """
        pending = [node]
        visited = set()
        while pending:
            current = pending.pop()
            if current in visited:
                continue
            visited.add(current)
            for p, o in graph[current]:
                res_graph.add((current, p, o))
                if isinstance(o, BNode):
                    pending.append(o)

    def calculate_records_nb(self, records_url):
        """Return the number of ``{IIEP}Record`` elements in ``records_url``.

        The source is streamed with ``ET.iterparse``; each element is cleared
        once seen so counting does not hold the whole document in memory.
        """
        record_tag = "{%s}Record" % IIEP
        count = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag == record_tag:
                count += 1
            # clear() keeps the tag, so enclosing elements are still counted
            elem.clear()
        return count

    def process_url(self, records_url, options):
        """Import every record found in ``records_url``.

        ``options`` is accepted for interface compatibility and is not read
        here; ``self.batch_size`` and ``self.preserve`` (set by ``handle``)
        are used instead.

        Returns a list of ``(record_number, records_url, message)`` tuples,
        one per record whose import raised an exception.
        """
        total_records = self.calculate_records_nb(records_url)
        record_tag = "{%s}Record" % IIEP
        # a batch size of 0 would make the modulo below raise ZeroDivisionError
        batch_size = self.batch_size or 1
        writer = None
        errors = []
        i = 0

        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag != record_tag:
                continue

            i += 1
            writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
            try:
                record_graph = get_empty_graph()
                record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')
                # NOTE(review): delete=True presumably removes an existing
                # record before rebuilding it -- confirm in RecordParser.
                self.record_parser.build_record(record_graph, delete=(not self.preserve))
            except Exception as e:
                transaction.rollback()
                msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
                logger.exception(msg)
                errors.append((i, records_url, msg))
            else:
                transaction.commit()
            finally:
                # the record has been serialized above; release its subtree so
                # memory use stays flat on large files
                elem.clear()

            if i % batch_size == 0:
                reset_queries()

        return errors

#    def process_url(self, records_url, options):
#        #open graph with rdflib
#
#        g = Graph()
#        print("Loading %s" % records_url)
#        g.parse(records_url)
#        print("Parsing %s done" % records_url)
#        for i,record_uri in enumerate(g[:RDF.type:IIEP.Record]):
#            print(i, repr(record_uri))
#            record_graph = self.get_empty_graph()
#            self.filter_node(record_uri, g, record_graph)
#            self.build_record(record_graph)
#            if i > 3:
#                break

    def handle(self, *args, **options):
        """Import every source given in ``args``.

        Unless ``--index`` was given, ``settings.REALTIME_INDEXING`` is set to
        ``False`` for the duration of the import. Transaction management and
        the previous setting value are restored even when the import raises.
        """
        self.batch_size = options.get('batch_size', 50)
        self.preserve = options.get("preserve", False)
        self.index = options.get("index", False)

        old_realtime_indexing = None
        if not self.index:
            old_realtime_indexing = getattr(settings, "REALTIME_INDEXING", None)
            #this is not recommended by the django manual, but in case of management command it seems to work
            settings.REALTIME_INDEXING = False

        transaction.enter_transaction_management()
        transaction.managed(True)
        try:
            for records_url in args:
                print("Processing %s" % records_url)
                errors = self.process_url(records_url, options)
                print("Processing %s Done" % records_url)
                if errors:
                    print("%d error(s) when processing %s, check your log file." % (len(errors), records_url))
        except BaseException:
            # discard pending work so that leaving transaction management
            # below does not mask the original exception
            transaction.rollback()
            raise
        finally:
            transaction.leave_transaction_management()
            # restore any previously defined value, including falsy ones
            if not self.index and old_realtime_indexing is not None:
                settings.REALTIME_INDEXING = old_realtime_indexing
|
| 0 | 147 |