| author | ymh <ymh.work@gmail.com> |
| Fri, 20 Sep 2013 10:34:49 +0200 | |
| changeset 107 | 48440ff95906 |
| parent 106 | 71684a2ea502 |
| child 108 | c08f9b46a6c5 |
| permissions | -rw-r--r-- |
| 0 | 1 |
# -*- coding: utf-8 -*- |
2 |
||
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
3 |
import logging |
|
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
4 |
from optparse import make_option |
|
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
5 |
|
| 6 | 6 |
from django.core.management import BaseCommand |
7 |
from django.db import reset_queries, transaction |
|
| 107 | 8 |
from rdflib import BNode |
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
9 |
|
| 107 | 10 |
from p4l.mapping.constants import get_empty_graph, IIEP |
11 |
from p4l.mapping.parsers import RecordParser, QueryCache |
|
|
13
6296aa12fd71
model simplification, correct import on language. We do not try to impose a language when none is found. add forgotten abstract field on import.
ymh <ymh.work@gmail.com>
parents:
7
diff
changeset
|
12 |
from p4l.utils import show_progress |
| 0 | 13 |
import xml.etree.cElementTree as ET |
14 |
||
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
15 |
|
| 0 | 16 |
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Language URI constant; the "#eng" fragment presumably designates English in
# the OASIS ISO 639 subject set (TODO confirm). Not referenced by the code
# visible in this module.
DEFAULT_LANGUAGE_URI = "http://psi.oasis-open.org/iso/639/#eng"

# SPARQL query selecting, as ?main_lang, the dct:language value of a subject
# when one is bound and its iiep:otherLanguage value otherwise (COALESCE keeps
# the first bound variable). The dct: and iiep: prefixes are not declared in
# the query itself, so they must be supplied by whoever runs it. Not
# referenced by the code visible in this module.
DEFAULT_LANGUAGE_QUERY = """SELECT ( COALESCE(?lang, ?other_lang) as ?main_lang) WHERE {
    OPTIONAL { ?s dct:language ?lang }.
    OPTIONAL { ?s iiep:otherLanguage ?other_lang }.
}"""
|
25 |
||
26 |
||
27 |
class Command(BaseCommand):
    """Management command importing p4l records from RDF/XML files.

    Every positional argument is handed to ``ElementTree.iterparse``. The
    document is streamed, and each ``Record`` element (in the ``IIEP``
    namespace) is parsed into its own graph and passed to
    ``RecordParser.build_record``. Each record is committed on success and
    rolled back on failure, so one bad record does not abort the import.
    """

    args = "record_url ..."

    help = "Import p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-b', '--batch-size',
            dest='batch_size',
            type='int',
            default=50,
            help='number of object to import in bulk operations'
        ),
        make_option('-p', '--preserve',
            dest='preserve',
            action='store_true',
            default=False,
            help='preserve existing record'
        ),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the command and the record parser reused for all records."""
        super(Command, self).__init__(*args, **kwargs)
        self.record_parser = RecordParser(query_cache=QueryCache())

    def filter_node(self, node, graph, res_graph, _seen=None):
        """Copy every triple reachable from ``node`` into ``res_graph``.

        All ``(node, p, o)`` triples of ``graph`` are added to ``res_graph``;
        whenever ``o`` is a blank node, its own triples are copied
        recursively. This helper is not called by the streaming import
        implemented in this class.

        :param node: subject whose description is copied.
        :param graph: source graph, indexed as ``graph[node]`` to obtain
            (predicate, object) pairs.
        :param res_graph: destination graph, modified in place.
        :param _seen: private set of subjects already visited; callers should
            leave it unset. It stops the recursion on blank-node cycles.
        """
        if _seen is None:
            _seen = set()
        if node in _seen:
            return
        _seen.add(node)

        for p, o in graph[node]:
            res_graph.add((node, p, o))
            if isinstance(o, BNode):
                self.filter_node(o, graph, res_graph, _seen)

    def calculate_records_nb(self, records_url):
        """Return the number of ``Record`` elements found in ``records_url``.

        :param records_url: file name or file object given to ``iterparse``.
        """
        record_tag = "{%s}Record" % IIEP
        count = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag == record_tag:
                count += 1
                # Only the tag is needed; drop the subtree so that memory
                # does not grow with the size of the file.
                elem.clear()
        return count

    def process_url(self, records_url, options):
        """Import every record of ``records_url`` and return the failures.

        The file is read twice: once to count the records (for the progress
        display) and once to import them.

        :param records_url: file name or file object given to ``iterparse``.
        :param options: command options; unused here, since the settings are
            read from ``self.batch_size`` and ``self.preserve``.
        :returns: list of ``(record_index, records_url, message)`` tuples, one
            per record whose import raised an exception.
        """
        total_records = self.calculate_records_nb(records_url)
        writer = None
        errors = []
        record_tag = "{%s}Record" % IIEP

        i = 0
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag != record_tag:
                continue

            i += 1
            writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
            try:
                record_graph = get_empty_graph()
                record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')
                self.record_parser.build_record(record_graph, delete=(not self.preserve))
            except Exception as e:
                # Undo the partial work of this record only, then carry on
                # with the next one.
                transaction.rollback()
                msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
                logger.exception(msg)
                errors.append((i, records_url, msg))
            else:
                transaction.commit()
            finally:
                # The record has been serialised above; release its subtree
                # so that the parsed document does not accumulate in memory.
                elem.clear()

            # A batch size of 0 (or None) would make the modulo fail; in that
            # case the query log is simply never reset.
            if self.batch_size and i % self.batch_size == 0:
                reset_queries()

        return errors

    def handle(self, *args, **options):
        """Import each file given on the command line.

        Reads the ``batch_size`` and ``preserve`` options, switches to manual
        transaction management and processes the files in order, printing a
        summary line when a file produced errors.
        """
        self.batch_size = options.get('batch_size', 50)
        self.preserve = options.get("preserve", False)

        transaction.enter_transaction_management()
        try:
            transaction.managed(True)

            for records_url in args:
                print("Processing %s" % records_url)
                errors = self.process_url(records_url, options)
                print("Processing %s Done" % records_url)
                if errors:
                    print("%d error(s) when processing %s, check your log file." % (len(errors), records_url))
        finally:
            # Always balance enter_transaction_management(), even when a file
            # cannot be read or parsed.
            transaction.leave_transaction_management()
|
132 |