# -*- coding: utf-8 -*-
'''
Created on Aug 30, 2013

@author: ymh

Django management command exporting p4l ``Record`` objects as RDF/XML into a
single file, optionally compressed with bzip2 or gzip.
'''

import bz2
import codecs
import gzip
import logging
from optparse import make_option
from xml.sax.saxutils import XMLGenerator
from xml.sax.xmlreader import AttributesNSImpl

from django.core.management import BaseCommand
from django.core.management.base import CommandError
from django.db.models.fields.related import ForeignKey

from p4l.management.constants import (GRAPH_NAMESPACES, RDF, get_empty_graph,
                                      IIEP, DCT)
from p4l.mapping.serializers import (ModelSerializer, SimpleFieldSerializer,
                                     BooleanFieldSerializer, RelatedFieldSerializer)
from p4l.models.data import Record
from p4l.utils import show_progress
from rdflib.namespace import RDFS


logger = logging.getLogger(__name__)


# NOTE(review): in the serializer declarations below, ``lang_field='lang'``
# presumably names the model attribute holding the language of the literal;
# confirm against p4l.mapping.serializers.

class ImprintSerializer(ModelSerializer):
    '''Field/predicate mapping for the objects exposed as ``Record.imprints``.'''

    imprintCity = SimpleFieldSerializer(predicate=IIEP.imprintCity, lang_field='lang')
    publisher = SimpleFieldSerializer(predicate=IIEP.publisher, lang_field='lang')
    imprintDate = SimpleFieldSerializer(predicate=IIEP.imprintDate, lang_field='lang')


class VolumeIssueSerializer(ModelSerializer):
    '''Field/predicate mapping for the objects exposed as ``Record.volumeIssues``.'''

    volume = SimpleFieldSerializer(predicate=IIEP.volume, lang_field='lang')
    number = SimpleFieldSerializer(predicate=IIEP.number, lang_field='lang')


class MeetingSerializer(ModelSerializer):
    '''Field/predicate mapping for the objects exposed as ``Record.meetings``.'''

    label = SimpleFieldSerializer(predicate=RDFS.label, lang_field='lang')
    meetingNumber = SimpleFieldSerializer(predicate=IIEP.meetingNumber, lang_field='lang')
    meetingPlace = SimpleFieldSerializer(predicate=IIEP.meetingPlace, lang_field='lang')
    meetingDate = SimpleFieldSerializer(predicate=IIEP.meetingDate, lang_field='lang')
    meetingYear = SimpleFieldSerializer(predicate=IIEP.meetingYear, lang_field='lang')


class SubjectMeetingSerializer(ModelSerializer):
    '''
    Field/predicate mapping for the objects exposed as
    ``Record.subjectMeetings``.

    Same fields and predicates as ``MeetingSerializer`` but declared without
    a ``lang_field``.
    '''

    label = SimpleFieldSerializer(predicate=RDFS.label)
    meetingNumber = SimpleFieldSerializer(predicate=IIEP.meetingNumber)
    meetingPlace = SimpleFieldSerializer(predicate=IIEP.meetingPlace)
    meetingDate = SimpleFieldSerializer(predicate=IIEP.meetingDate)
    meetingYear = SimpleFieldSerializer(predicate=IIEP.meetingYear)


class SerieSerializer(ModelSerializer):
    '''Field/predicate mapping for the objects exposed as ``Record.series``.'''

    title = SimpleFieldSerializer(predicate=DCT.title, lang_field='lang')
    volume = SimpleFieldSerializer(predicate=IIEP.volume, lang_field='lang')


class UrlSerializer(ModelSerializer):
    '''Field/predicate mapping for the objects exposed as ``Record.urls``.'''

    address = SimpleFieldSerializer(predicate=IIEP.address)
    display = SimpleFieldSerializer(predicate=IIEP.display)
    accessLevel = SimpleFieldSerializer(predicate=IIEP.accessLevel)


class RecordSerializer(ModelSerializer):
    '''
    Field/predicate mapping for a ``Record``.

    The declarations are grouped as: plain fields of the record itself,
    relations serialized through their ``uri`` attribute, relations
    serialized through a single value attribute, and relations delegated to
    one of the nested serializers defined above.
    '''

    # Plain fields of the record.
    identifier = SimpleFieldSerializer(predicate=DCT.identifier)
    notes = SimpleFieldSerializer(predicate=IIEP.notes)
    editionStatement = SimpleFieldSerializer(predicate=IIEP.editionStatement)
    recordType = SimpleFieldSerializer(predicate=DCT.type)
    isDocumentPart = BooleanFieldSerializer(predicate=IIEP.isDocumentPart)

    # Relations whose value is taken from the related object's ``uri``.
    language = RelatedFieldSerializer(many=False, value_field='uri', predicate=DCT.language)
    otherLanguages = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.otherLanguage)
    subjects = RelatedFieldSerializer(many=True, value_field='uri', predicate=DCT.subject)
    themes = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.theme)
    countries = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.country)
    projectNames = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.projectName)
    subjectCorporateBodies = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.subjectCorporateBody)
    corporateAuthors = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.corporateAuthor)

    # Relations whose value is taken from a single attribute of the related
    # object.
    isbns = RelatedFieldSerializer(many=True, value_field='isbn', predicate=IIEP.isbn, lang_field='lang')
    issns = RelatedFieldSerializer(many=True, value_field='issn', predicate=IIEP.issn, lang_field='lang')
    collations = RelatedFieldSerializer(many=True, value_field='collation', predicate=IIEP.collation, lang_field='lang')
    documentCodes = RelatedFieldSerializer(many=True, value_field='documentCode', predicate=IIEP.documentCode, lang_field='lang')
    titles = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.title, lang_field='lang')
    addedTitles = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.addedTitle, lang_field='lang')
    titlesMainDocument = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.titleMainDocument, lang_field='lang')
    abstracts = RelatedFieldSerializer(many=True, value_field='abstract', predicate=IIEP.abstract, lang_field='lang')
    periodicals = RelatedFieldSerializer(many=True, value_field='label', predicate=IIEP.periodical, lang_field='lang')
    authors = RelatedFieldSerializer(many=True, value_field='name', predicate=IIEP.author)
    subjectPersons = RelatedFieldSerializer(many=True, value_field='name', predicate=IIEP.subjectPerson)

    # Relations delegated to a nested serializer.
    imprints = ImprintSerializer(many=True, predicate=IIEP.imprint)
    volumeIssues = VolumeIssueSerializer(many=True, predicate=IIEP.volumeIssue)
    meetings = MeetingSerializer(many=True, predicate=IIEP.meeting)
    subjectMeetings = SubjectMeetingSerializer(many=True, predicate=IIEP.subjectMeeting)
    series = SerieSerializer(many=True, predicate=IIEP.serie)
    urls = UrlSerializer(many=True, predicate=IIEP.url)

    class Meta:
        type = IIEP.Record
        uri_fieldname = "uri"


class Command(BaseCommand):
    '''
    ``dump_record`` management command.

    Takes exactly one positional argument, the destination file path, and the
    options declared in ``option_list``.
    '''

    args = "file_path..."

    help = "Export p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-l', '--limit',
            dest='limit',
            type='int',
            default=-1,
            help='number of record to export. -1 is all (default)'
        ),
        make_option('-s', '--skip',
            dest='skip',
            type='int',
            default=0,
            help='number of record to skip before export. default 0.'
        ),
        make_option('-b', '--batch',
            dest='batch',
            type='int',
            default=100,
            # The help text used to announce 500 while the effective default
            # (here and in handle()) is 100.
            help='query batch default 100.'
        ),
        make_option('-j', '--bzip2',
            dest='bzip2',
            action='store_true',
            default=False,
            help='bz2 compress'
        ),
        make_option('-z', '--gzip',
            dest='gzip',
            action='store_true',
            default=False,
            help='gzip compress'
        ),
    )

    def get_graph_from_object(self, obj):
        '''
        Build a new graph holding the serialization of ``obj``.

        A graph is created with ``get_empty_graph()`` and filled by
        ``RecordSerializer.to_graph``; the graph is returned.
        '''
        g = get_empty_graph()

        serializer = RecordSerializer()
        serializer.to_graph(None, obj, None, g)

        return g

    def handle(self, *args, **options):
        '''
        Run the export.

        ``args`` must hold exactly one item, the destination path. When
        ``--bzip2`` (or else ``--gzip``) is set, the matching extension is
        appended to the path if missing and the output is compressed;
        ``--bzip2`` wins when both are given. Records are read ordered by
        ``identifier``, restricted by ``--skip`` / ``--limit`` and fetched
        ``--batch`` at a time.

        Raises ``CommandError`` when the argument count is wrong or when the
        batch size is not a positive integer.
        '''
        if len(args) != 1:
            raise CommandError("This command takes exactly one argument")

        filepath = args[0]

        bzip2 = options.get('bzip2', False)
        gzip_opt = options.get('gzip', False)

        if bzip2 and not filepath.endswith(".bz2"):
            filepath += ".bz2"
        elif gzip_opt and not filepath.endswith(".gz"):
            filepath += ".gz"

        limit = options.get("limit", -1)
        skip = options.get("skip", 0)
        batch = options.get("batch", 100)

        # A zero batch used to surface as a bare ZeroDivisionError and a
        # negative one silently exported nothing; report both clearly.
        if batch < 1:
            raise CommandError("The batch size must be a positive integer")

        # Join every foreign key and prefetch every many-to-many and reverse
        # relation of Record.
        foreign_keys = [field.name for field in Record._meta.fields
                        if isinstance(field, ForeignKey)]
        many_to_many = [field.name for field in Record._meta.many_to_many]
        reverse_relations = [obj.get_accessor_name()
                             for obj in Record._meta.get_all_related_objects()]

        qs = Record.objects.all()  # @UndefinedVariable
        qs = qs.select_related(*foreign_keys)
        qs = qs.prefetch_related(*(many_to_many + reverse_relations))
        qs = qs.order_by('identifier')

        if limit >= 0:
            qs = qs[skip:skip + limit]
        else:
            qs = qs[skip:]

        if bzip2:
            open_method = bz2.BZ2File
            open_args = [filepath, 'wb', 9]
        elif gzip_opt:
            open_method = gzip.GzipFile
            open_args = [filepath, 'wb', 9]
        else:
            open_method = codecs.open
            open_args = [filepath, 'wb', "utf-8"]

        total_records = qs.count()

        print("Total record to export : %d" % total_records)
        progress_writer = None

        # Integer ceiling division: "/" yields a float on Python 3 (which
        # range() rejects) and the former "+ 1" issued one extra, empty query
        # whenever the total was an exact multiple of the batch size.
        batch_count = (total_records + batch - 1) // batch

        with open_method(*open_args) as dest_file:
            writer = XMLGenerator(dest_file, "UTF-8")
            writer.startDocument()
            for prefix, uri in GRAPH_NAMESPACES.items():
                writer.startPrefixMapping(prefix, uri)
            writer.startElementNS((RDF, 'RDF'), 'RDF', AttributesNSImpl({}, {}))
            writer.characters("\n")
            for n in range(batch_count):
                offset = n * batch
                for i, r in enumerate(qs[offset:offset + batch]):
                    progress_writer = show_progress(i + offset + 1, total_records, "Exporting record %s" % r.identifier, 50, progress_writer)
                    graph = self.get_graph_from_object(r)
                    # NOTE(review): this loop looks truncated. ``"" in line``
                    # is always true, so it breaks on the first line and
                    # nothing from ``graph`` reaches ``dest_file``;
                    # ``do_write`` is assigned but never read. The marker
                    # strings and the write statement appear to have been
                    # lost from this copy (presumably XML tag literals that
                    # were stripped) -- restore them from the repository
                    # history before relying on this command.
                    do_write = False
                    for line in graph.serialize(format="pretty-xml", encoding="utf-8").splitlines(True):
                        if "" in line:
                            break

            writer.endElementNS((RDF, 'RDF'), 'RDF')
            writer.endDocument()
            dest_file.write("\n")