| author | ymh <ymh.work@gmail.com> |
| Tue, 01 Oct 2013 02:14:08 +0200 | |
| changeset 126 | a345f1a67bf1 |
| parent 119 | ece69ca3ac24 |
| child 131 | f1854630734f |
| permissions | -rw-r--r-- |
| 0 | 1 |
# -*- coding: utf-8 -*- |
| 126 | 2 |
# |
3 |
# Copyright IRI (2013) |
|
4 |
# |
|
5 |
# contact@iri.centrepompidou.fr |
|
6 |
# |
|
7 |
# This software is governed by the CeCILL-B license under French law and |
|
8 |
# abiding by the rules of distribution of free software. You can use, |
|
9 |
# modify and/ or redistribute the software under the terms of the CeCILL-B |
|
10 |
# license as circulated by CEA, CNRS and INRIA at the following URL |
|
11 |
# "http://www.cecill.info". |
|
12 |
# |
|
13 |
# As a counterpart to the access to the source code and rights to copy, |
|
14 |
# modify and redistribute granted by the license, users are provided only |
|
15 |
# with a limited warranty and the software's author, the holder of the |
|
16 |
# economic rights, and the successive licensors have only limited |
|
17 |
# liability. |
|
18 |
# |
|
19 |
# In this respect, the user's attention is drawn to the risks associated |
|
20 |
# with loading, using, modifying and/or developing or reproducing the |
|
21 |
# software by the user in light of its specific status of free software, |
|
22 |
# that may mean that it is complicated to manipulate, and that also |
|
23 |
# therefore means that it is reserved for developers and experienced |
|
24 |
# professionals having in-depth computer knowledge. Users are therefore |
|
25 |
# encouraged to load and test the software's suitability as regards their |
|
26 |
# requirements in conditions enabling the security of their systems and/or |
|
27 |
# data to be ensured and, more generally, to use and operate it in the |
|
28 |
# same conditions as regards security. |
|
29 |
# |
|
30 |
# The fact that you are presently reading this means that you have had |
|
31 |
# knowledge of the CeCILL-B license and that you accept its terms. |
|
32 |
# |
|
| 0 | 33 |
|
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
34 |
import logging |
|
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
35 |
from optparse import make_option |
|
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
36 |
|
| 6 | 37 |
from django.core.management import BaseCommand |
38 |
from django.db import reset_queries, transaction |
|
| 107 | 39 |
from rdflib import BNode |
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
40 |
|
| 107 | 41 |
from p4l.mapping.constants import get_empty_graph, IIEP |
42 |
from p4l.mapping.parsers import RecordParser, QueryCache |
|
|
13
6296aa12fd71
model simplification, correct import on language. We do not try to impose a language when none is found. add forgotten abstract field on import.
ymh <ymh.work@gmail.com>
parents:
7
diff
changeset
|
43 |
from p4l.utils import show_progress |
| 0 | 44 |
import xml.etree.cElementTree as ET |
|
114
93b45b4f423c
add corporate authors and small adjustments
ymh <ymh.work@gmail.com>
parents:
108
diff
changeset
|
45 |
from django.conf import settings |
| 0 | 46 |
|
|
101
71532a54d1c4
update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
22
diff
changeset
|
47 |
|
| 0 | 48 |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Language URI ending in the ISO 639 code "eng".
# NOTE(review): not referenced in the visible part of this module - confirm
# whether other modules import it.
DEFAULT_LANGUAGE_URI = "http://psi.oasis-open.org/iso/639/#eng"

# SPARQL query binding ?main_lang to the dct:language value of a subject,
# or to its iiep:otherLanguage value when dct:language is not bound.
# The "dct" and "iiep" prefixes are not declared in the query itself, so
# they presumably have to be bound on the graph it runs against - confirm.
DEFAULT_LANGUAGE_QUERY = """SELECT ( COALESCE(?lang, ?other_lang) as ?main_lang) WHERE {
OPTIONAL { ?s dct:language ?lang }.
OPTIONAL { ?s iiep:otherLanguage ?other_lang }.
}"""
|
57 |
||
58 |
||
59 |
class Command(BaseCommand):
    """Management command importing p4l records from RDF/XML sources.

    Each positional argument is handed to ``ElementTree.iterparse``. Every
    ``Record`` element in the IIEP namespace is parsed into its own rdflib
    graph and passed to ``RecordParser.build_record``. Each record is
    committed (or rolled back on error) individually.
    """

    args = "record_url ..."

    help = "Import p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-b', '--batch-size',
            dest='batch_size',
            type='int',
            default=50,
            help='number of object to import in bulk operations'
        ),
        make_option('-p', '--preserve',
            dest='preserve',
            action='store_true',
            default=False,
            help='preserve existing record'
        ),
        make_option('-i', '--index',
            dest='index',
            action='store_true',
            default=False,
            help='index while importing'
        ),
    )

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self.record_parser = RecordParser(query_cache=QueryCache())
        # Defaults mirror the option defaults so that process_url() can be
        # called without going through handle() first.
        self.batch_size = 50
        self.preserve = False
        self.index = False

    def filter_node(self, node, graph, res_graph):
        """Copy into ``res_graph`` every triple reachable from ``node``.

        All (predicate, object) pairs of ``node`` in ``graph`` are added to
        ``res_graph``; objects that are blank nodes are followed in turn.
        Each node is expanded only once, so cyclic blank-node structures
        terminate instead of recursing forever.

        Not called by the streaming import in this class.
        """
        pending = [node]
        expanded = set()
        while pending:
            current = pending.pop()
            if current in expanded:
                continue
            expanded.add(current)
            for p, o in graph[current]:
                res_graph.add((current, p, o))
                if isinstance(o, BNode):
                    pending.append(o)

    def _iter_record_elements(self, records_url):
        """Yield each completed ``Record`` element found in ``records_url``."""
        # Built once: the qualified tag does not change during the scan.
        record_tag = "{%s}Record" % IIEP
        # NOTE(review): elements are never cleared, so the whole tree stays
        # in memory while parsing. Calling elem.clear() after use would cap
        # memory, but is only safe if Record elements are never nested -
        # confirm against the data before adding it.
        for _, elem in ET.iterparse(records_url, events=("end",)):
            if elem.tag == record_tag:
                yield elem

    def calculate_records_nb(self, records_url):
        """Return the number of ``Record`` elements in ``records_url``."""
        return sum(1 for _ in self._iter_record_elements(records_url))

    def process_url(self, records_url, options):
        """Import every record found in ``records_url``.

        The source is parsed twice: once to count the records for the
        progress display, once to import them. A failure on one record is
        rolled back, logged and collected; the import then continues with
        the next record.

        ``options`` is accepted for interface compatibility and is not used.

        Returns a list of ``(record_index, records_url, message)`` tuples,
        one per failed record (record_index is 1-based).
        """
        total_records = self.calculate_records_nb(records_url)
        writer = None
        errors = []

        for i, elem in enumerate(self._iter_record_elements(records_url), 1):
            writer = show_progress(i, total_records, "Processing record nb %d " % i, 50, writer=writer)
            try:
                record_graph = get_empty_graph()
                record_graph.parse(data=ET.tostring(elem, encoding='utf-8'), format='xml')
                self.record_parser.build_record(record_graph, delete=(not self.preserve))
            except Exception as e:
                transaction.rollback()
                msg = "Error processing resource %d in %s : %s" % (i, records_url, repr(e))
                logger.exception(msg)
                errors.append((i, records_url, msg))
            else:
                transaction.commit()

            # Periodically drop the queries Django has recorded so far.
            # A batch size of 0 disables the reset instead of raising
            # ZeroDivisionError.
            if self.batch_size and i % self.batch_size == 0:
                reset_queries()

        return errors

    def handle(self, *args, **options):
        """Import every source given in ``args``.

        Unless ``--index`` is given, ``settings.REALTIME_INDEXING`` is set
        to False for the duration of the import. The previous value and the
        transaction management state are restored even when an import
        raises.
        """
        self.batch_size = options.get('batch_size', 50)
        self.preserve = options.get("preserve", False)
        self.index = options.get("index", False)

        old_realtime_indexing = None
        if not self.index:
            old_realtime_indexing = getattr(settings, "REALTIME_INDEXING", None)
            #this is not recommended by the django manual, but in case of management command it seems to work
            settings.REALTIME_INDEXING = False

        try:
            transaction.enter_transaction_management()
            try:
                transaction.managed(True)

                for records_url in args:
                    print("Processing %s" % records_url)
                    errors = self.process_url(records_url, options)
                    print("Processing %s Done" % records_url)
                    if errors:
                        print("%d error(s) when processing %s, check your log file." % (len(errors), records_url))
            finally:
                transaction.leave_transaction_management()
        finally:
            # Restore the setting whenever it had a value before the import.
            if not self.index and old_realtime_indexing is not None:
                settings.REALTIME_INDEXING = old_realtime_indexing
|
| 0 | 178 |