|
1 # -*- coding: utf-8 -*- |
|
2 ''' |
|
3 Created on Aug 30, 2013 |
|
4 |
|
5 @author: ymh |
|
6 ''' |
|
7 |
|
8 import bz2 |
|
9 import codecs |
|
10 import gzip |
|
11 import logging |
|
12 from optparse import make_option |
|
13 from xml.sax.saxutils import XMLGenerator |
|
14 from xml.sax.xmlreader import AttributesNSImpl |
|
15 |
|
16 from django.core.management import BaseCommand |
|
17 from django.core.management.base import CommandError |
|
18 from django.db.models.fields.related import ForeignKey |
|
19 |
|
20 from p4l.management.constants import (GRAPH_NAMESPACES, RDF, get_empty_graph, |
|
21 IIEP, DCT) |
|
22 from p4l.mapping.serializers import (ModelSerializer, SimpleFieldSerializer, |
|
23 BooleanFieldSerializer, RelatedFieldSerializer) |
|
24 from p4l.models.data import Record |
|
25 from p4l.utils import show_progress |
|
26 from rdflib.namespace import RDFS |
|
27 |
|
28 |
|
# Module-level logger, named after this module so log records can be filtered
# per management command.
logger = logging.getLogger(__name__)
|
30 |
|
class ImprintSerializer(ModelSerializer):
    """Declare the RDF mapping of an imprint.

    The three fields (city, publisher, date) are each bound to the IIEP
    predicate of the same name and are all configured with
    ``lang_field='lang'``.
    """

    imprintCity = SimpleFieldSerializer(
        predicate=IIEP.imprintCity,
        lang_field='lang',
    )
    publisher = SimpleFieldSerializer(
        predicate=IIEP.publisher,
        lang_field='lang',
    )
    imprintDate = SimpleFieldSerializer(
        predicate=IIEP.imprintDate,
        lang_field='lang',
    )
|
36 |
|
37 |
|
class VolumeIssueSerializer(ModelSerializer):
    """Declare the RDF mapping of a volume/issue pair.

    ``volume`` and ``number`` are bound to ``IIEP.volume`` and
    ``IIEP.number``, both configured with ``lang_field='lang'``.
    """

    volume = SimpleFieldSerializer(
        predicate=IIEP.volume,
        lang_field='lang',
    )
    number = SimpleFieldSerializer(
        predicate=IIEP.number,
        lang_field='lang',
    )
|
41 |
|
42 |
|
class MeetingSerializer(ModelSerializer):
    """Declare the RDF mapping of a meeting attached to a record.

    The label uses ``RDFS.label``; the remaining fields use the IIEP
    predicate of the same name. Every field is configured with
    ``lang_field='lang'``.
    """

    label = SimpleFieldSerializer(
        predicate=RDFS.label,
        lang_field='lang',
    )
    meetingNumber = SimpleFieldSerializer(
        predicate=IIEP.meetingNumber,
        lang_field='lang',
    )
    meetingPlace = SimpleFieldSerializer(
        predicate=IIEP.meetingPlace,
        lang_field='lang',
    )
    meetingDate = SimpleFieldSerializer(
        predicate=IIEP.meetingDate,
        lang_field='lang',
    )
    meetingYear = SimpleFieldSerializer(
        predicate=IIEP.meetingYear,
        lang_field='lang',
    )
|
49 |
|
class SubjectMeetingSerializer(ModelSerializer):
    """Declare the RDF mapping of a meeting used as a record subject.

    The fields and predicates are the same as in ``MeetingSerializer``, but
    no ``lang_field`` is passed to any of them.
    """

    label = SimpleFieldSerializer(
        predicate=RDFS.label,
    )
    meetingNumber = SimpleFieldSerializer(
        predicate=IIEP.meetingNumber,
    )
    meetingPlace = SimpleFieldSerializer(
        predicate=IIEP.meetingPlace,
    )
    meetingDate = SimpleFieldSerializer(
        predicate=IIEP.meetingDate,
    )
    meetingYear = SimpleFieldSerializer(
        predicate=IIEP.meetingYear,
    )
|
56 |
|
57 |
|
class SerieSerializer(ModelSerializer):
    """Declare the RDF mapping of a series entry.

    ``title`` is bound to ``DCT.title`` and ``volume`` to ``IIEP.volume``;
    both are configured with ``lang_field='lang'``.
    """

    title = SimpleFieldSerializer(
        predicate=DCT.title,
        lang_field='lang',
    )
    volume = SimpleFieldSerializer(
        predicate=IIEP.volume,
        lang_field='lang',
    )
|
61 |
|
62 |
|
class UrlSerializer(ModelSerializer):
    """Declare the RDF mapping of a URL attached to a record.

    ``address``, ``display`` and ``accessLevel`` are each bound to the IIEP
    predicate of the same name; no ``lang_field`` is configured.
    """

    address = SimpleFieldSerializer(
        predicate=IIEP.address,
    )
    display = SimpleFieldSerializer(
        predicate=IIEP.display,
    )
    accessLevel = SimpleFieldSerializer(
        predicate=IIEP.accessLevel,
    )
|
67 |
|
68 |
|
69 |
|
class RecordSerializer(ModelSerializer):
    """Declare the RDF mapping of a ``Record``.

    The declarations fall into four groups, kept in their original order:

    * plain fields of the record itself;
    * related objects serialized through their ``uri`` attribute;
    * related objects serialized through a named value attribute (most of
      them also configured with ``lang_field='lang'``);
    * related objects serialized by a dedicated nested serializer.

    ``Meta`` sets the type to ``IIEP.Record`` and names ``uri`` as the URI
    field.
    """

    # --- Plain fields -----------------------------------------------------
    identifier = SimpleFieldSerializer(predicate=DCT.identifier)
    notes = SimpleFieldSerializer(predicate=IIEP.notes)
    editionStatement = SimpleFieldSerializer(predicate=IIEP.editionStatement)
    recordType = SimpleFieldSerializer(predicate=DCT.type)
    isDocumentPart = BooleanFieldSerializer(predicate=IIEP.isDocumentPart)

    # --- Related objects referenced by their 'uri' ------------------------
    language = RelatedFieldSerializer(
        many=False,
        value_field='uri',
        predicate=DCT.language,
    )
    otherLanguages = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=IIEP.otherLanguage,
    )
    subjects = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=DCT.subject,
    )
    themes = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=IIEP.theme,
    )
    countries = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=IIEP.country,
    )
    projectNames = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=IIEP.projectName,
    )
    subjectCorporateBodies = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=IIEP.subjectCorporateBody,
    )
    corporateAuthors = RelatedFieldSerializer(
        many=True,
        value_field='uri',
        predicate=IIEP.corporateAuthor,
    )

    # --- Related objects referenced by a value attribute ------------------
    isbns = RelatedFieldSerializer(
        many=True,
        value_field='isbn',
        predicate=IIEP.isbn,
        lang_field='lang',
    )
    issns = RelatedFieldSerializer(
        many=True,
        value_field='issn',
        predicate=IIEP.issn,
        lang_field='lang',
    )
    collations = RelatedFieldSerializer(
        many=True,
        value_field='collation',
        predicate=IIEP.collation,
        lang_field='lang',
    )
    documentCodes = RelatedFieldSerializer(
        many=True,
        value_field='documentCode',
        predicate=IIEP.documentCode,
        lang_field='lang',
    )
    titles = RelatedFieldSerializer(
        many=True,
        value_field='title',
        predicate=IIEP.title,
        lang_field='lang',
    )
    addedTitles = RelatedFieldSerializer(
        many=True,
        value_field='title',
        predicate=IIEP.addedTitle,
        lang_field='lang',
    )
    titlesMainDocument = RelatedFieldSerializer(
        many=True,
        value_field='title',
        predicate=IIEP.titleMainDocument,
        lang_field='lang',
    )
    abstracts = RelatedFieldSerializer(
        many=True,
        value_field='abstract',
        predicate=IIEP.abstract,
        lang_field='lang',
    )
    periodicals = RelatedFieldSerializer(
        many=True,
        value_field='label',
        predicate=IIEP.periodical,
        lang_field='lang',
    )
    # Person names carry no lang_field.
    authors = RelatedFieldSerializer(
        many=True,
        value_field='name',
        predicate=IIEP.author,
    )
    subjectPersons = RelatedFieldSerializer(
        many=True,
        value_field='name',
        predicate=IIEP.subjectPerson,
    )

    # --- Related objects with their own nested serializer -----------------
    imprints = ImprintSerializer(many=True, predicate=IIEP.imprint)
    volumeIssues = VolumeIssueSerializer(many=True, predicate=IIEP.volumeIssue)
    meetings = MeetingSerializer(many=True, predicate=IIEP.meeting)
    subjectMeetings = SubjectMeetingSerializer(many=True, predicate=IIEP.subjectMeeting)
    series = SerieSerializer(many=True, predicate=IIEP.serie)
    urls = UrlSerializer(many=True, predicate=IIEP.url)

    class Meta:
        # Serializer configuration: the type assigned to each record and the
        # name of the model field holding its URI.
        type = IIEP.Record
        uri_fieldname = "uri"
|
110 |
|
class Command(BaseCommand):
    """Management command exporting p4l records to a single RDF/XML file.

    The command takes exactly one positional argument, the destination file
    path. Records are read in batches ordered by ``identifier``; each record
    is turned into a graph by ``RecordSerializer`` and the lines of its
    ``<iiep:Record>`` element are appended inside one ``rdf:RDF`` root
    element. The output can optionally be bzip2- or gzip-compressed.
    """

    args = "file_path..."

    help = "Export p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-l', '--limit',
            dest='limit',
            type='int',
            default=-1,
            help='number of record to export. -1 is all (default)'
        ),
        make_option('-s', '--skip',
            dest='skip',
            type='int',
            default=0,
            help='number of record to skip before export. default 0.'
        ),
        make_option('-b', '--batch',
            dest='batch',
            type='int',
            default=100,
            # The help text used to advertise 500 while the default is 100.
            help='query batch default 100.'
        ),
        make_option('-j', '--bzip2',
            dest='bzip2',
            action='store_true',
            default=False,
            help='bz2 compress'
        ),
        make_option('-z', '--gzip',
            dest='gzip',
            action='store_true',
            default=False,
            help='gzip compress'
        ),
    )

    def get_graph_from_object(self, obj):
        """Return a new graph holding the serialization of ``obj``.

        A fresh graph is obtained from ``get_empty_graph()`` and filled by
        ``RecordSerializer.to_graph``; the graph is then returned.
        """
        g = get_empty_graph()

        serializer = RecordSerializer()
        # NOTE(review): the meaning of the two ``None`` arguments is defined
        # by ModelSerializer.to_graph, which is not visible here.
        serializer.to_graph(None, obj, None, g)

        return g

    def _open_destination(self, filepath, bzip2, gzip_opt):
        """Open ``filepath`` for binary writing, compressed when requested.

        All three variants are opened in binary mode so that the export can
        write the serializer's UTF-8 bytes unchanged, whatever the output
        format. bzip2 takes precedence when both flags are set.
        """
        if bzip2:
            # compresslevel is passed by keyword: the third positional
            # parameter of BZ2File is (or was) ``buffering``, not the level.
            return bz2.BZ2File(filepath, 'wb', compresslevel=9)
        if gzip_opt:
            return gzip.GzipFile(filepath, 'wb', compresslevel=9)
        return open(filepath, 'wb')

    def _iter_record_lines(self, graph):
        """Yield the UTF-8 encoded lines of the record element of ``graph``.

        The graph is serialized as pretty RDF/XML; only the lines from the
        one containing ``<iiep:Record`` up to and including the one
        containing ``</iiep:Record>`` are yielded, which drops the XML
        declaration and the per-graph ``rdf:RDF`` wrapper.
        """
        serialized = graph.serialize(format="pretty-xml", encoding="utf-8")
        in_record = False
        for line in serialized.splitlines(True):
            if b"<iiep:Record" in line:
                in_record = True
            if in_record:
                yield line
            if b"</iiep:Record>" in line:
                break

    def handle(self, *args, **options):
        """Export the selected records to the file given as sole argument.

        Raises:
            CommandError: if the number of positional arguments is not
                exactly one, if the batch size is not positive, or if the
                number of records to skip is negative.
        """
        if len(args) != 1:
            raise CommandError("This command takes exactly one argument")

        filepath = args[0]

        bzip2 = options.get('bzip2', False)
        gzip_opt = options.get('gzip', False)

        # Make the file extension reflect the chosen compression.
        if bzip2 and not filepath.endswith(".bz2"):
            filepath += ".bz2"
        elif gzip_opt and not filepath.endswith(".gz"):
            filepath += ".gz"

        limit = options.get("limit", -1)
        skip = options.get("skip", 0)
        batch = options.get("batch", 100)

        # Fail early with a clear message instead of a ZeroDivisionError or
        # a queryset slicing error further down.
        if batch < 1:
            raise CommandError("The batch size must be a positive integer")
        if skip < 0:
            raise CommandError("The number of records to skip cannot be negative")

        # Load every relation of Record together with the records: foreign
        # keys through select_related, many-to-many and reverse relations
        # through prefetch_related.
        foreign_keys = [
            field.name for field in Record._meta.fields
            if isinstance(field, ForeignKey)
        ]
        prefetched = [field.name for field in Record._meta.many_to_many]
        prefetched += [
            rel.get_accessor_name()
            for rel in Record._meta.get_all_related_objects()
        ]
        qs = (
            Record.objects.all()  # @UndefinedVariable
            .select_related(*foreign_keys)
            .prefetch_related(*prefetched)
            .order_by('identifier')
        )

        if limit >= 0:
            qs = qs[skip:skip + limit]
        else:
            qs = qs[skip:]

        total_records = qs.count()

        print("Total record to export : %d" % total_records)
        progress_writer = None

        with self._open_destination(filepath, bzip2, gzip_opt) as dest_file:
            writer = XMLGenerator(dest_file, "UTF-8")
            writer.startDocument()
            for prefix, uri in GRAPH_NAMESPACES.items():
                writer.startPrefixMapping(prefix, uri)
            writer.startElementNS((RDF, 'RDF'), 'RDF', AttributesNSImpl({}, {}))
            writer.characters("\n")

            # Ceiling division: explicit integer arithmetic that does not
            # depend on the meaning of "/" and does not issue a trailing
            # empty query when the total is a multiple of the batch size.
            batch_count = (total_records + batch - 1) // batch
            for n in range(batch_count):
                offset = n * batch
                for i, r in enumerate(qs[offset:offset + batch]):
                    progress_writer = show_progress(
                        i + offset + 1,
                        total_records,
                        "Exporting record %s" % r.identifier,
                        50,
                        progress_writer,
                    )
                    graph = self.get_graph_from_object(r)
                    # The lines are already UTF-8 bytes: write them as-is
                    # rather than decoding them and relying on the
                    # destination to encode them again.
                    for line in self._iter_record_lines(graph):
                        dest_file.write(line)

            writer.endElementNS((RDF, 'RDF'), 'RDF')
            writer.endDocument()
            dest_file.write(b"\n")