src/p4l/management/commands/dump_record.py
changeset 101 71532a54d1c4
child 103 468349edbf7f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/p4l/management/commands/dump_record.py	Fri Sep 20 00:03:31 2013 +0200
@@ -0,0 +1,226 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Aug 30, 2013
+
+@author: ymh
+'''
+
+import bz2
+import codecs
+import gzip
+import logging
+from optparse import make_option
+from xml.sax.saxutils import XMLGenerator
+from xml.sax.xmlreader import AttributesNSImpl
+
+from django.core.management import BaseCommand
+from django.core.management.base import CommandError
+from django.db.models.fields.related import ForeignKey
+
+from p4l.management.constants import (GRAPH_NAMESPACES, RDF, get_empty_graph, 
+    IIEP, DCT)
+from p4l.mapping.serializers import (ModelSerializer, SimpleFieldSerializer, 
+    BooleanFieldSerializer, RelatedFieldSerializer)
+from p4l.models.data import Record
+from p4l.utils import show_progress
+from rdflib.namespace import RDFS
+
+
+# Module-level logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+
+class ImprintSerializer(ModelSerializer):
+    """Field declarations used to serialize an imprint to RDF.
+
+    Each attribute maps a field name to an ``IIEP`` predicate.  All three
+    fields pass ``lang_field='lang'`` (presumably the name of the attribute
+    holding the literal's language tag -- confirm in SimpleFieldSerializer).
+    """
+    
+    imprintCity = SimpleFieldSerializer(predicate=IIEP.imprintCity, lang_field='lang')
+    publisher = SimpleFieldSerializer(predicate=IIEP.publisher, lang_field='lang')
+    imprintDate = SimpleFieldSerializer(predicate=IIEP.imprintDate, lang_field='lang')
+
+
+class VolumeIssueSerializer(ModelSerializer):
+    """Field declarations used to serialize a volume/issue pair to RDF.
+
+    ``volume`` and ``number`` are mapped to the ``IIEP`` predicates of the
+    same name; both pass ``lang_field='lang'``.
+    """
+    volume = SimpleFieldSerializer(predicate=IIEP.volume, lang_field='lang')
+    number = SimpleFieldSerializer(predicate=IIEP.number, lang_field='lang')
+
+
+class MeetingSerializer(ModelSerializer):
+    """Field declarations used to serialize a meeting to RDF.
+
+    The label is mapped to ``rdfs:label``; the other fields are mapped to
+    ``IIEP`` predicates.  Every field passes ``lang_field='lang'``, which is
+    the only difference with ``SubjectMeetingSerializer``.
+    """
+    label = SimpleFieldSerializer(predicate=RDFS.label, lang_field='lang')
+    meetingNumber = SimpleFieldSerializer(predicate=IIEP.meetingNumber, lang_field='lang')
+    meetingPlace = SimpleFieldSerializer(predicate=IIEP.meetingPlace, lang_field='lang')
+    meetingDate = SimpleFieldSerializer(predicate=IIEP.meetingDate, lang_field='lang')
+    meetingYear = SimpleFieldSerializer(predicate=IIEP.meetingYear, lang_field='lang')
+
+class SubjectMeetingSerializer(ModelSerializer):
+    """Field declarations used to serialize a subject meeting to RDF.
+
+    Uses the same predicates as ``MeetingSerializer`` but declares no
+    ``lang_field`` on any of its fields.
+    """
+    label = SimpleFieldSerializer(predicate=RDFS.label)
+    meetingNumber = SimpleFieldSerializer(predicate=IIEP.meetingNumber)
+    meetingPlace = SimpleFieldSerializer(predicate=IIEP.meetingPlace)
+    meetingDate = SimpleFieldSerializer(predicate=IIEP.meetingDate)
+    meetingYear = SimpleFieldSerializer(predicate=IIEP.meetingYear)
+
+
+class SerieSerializer(ModelSerializer):
+    """Field declarations used to serialize a serie to RDF.
+
+    ``title`` is mapped to ``dct:title`` and ``volume`` to ``IIEP.volume``;
+    both pass ``lang_field='lang'``.
+    """
+    title = SimpleFieldSerializer(predicate=DCT.title, lang_field='lang')
+    volume = SimpleFieldSerializer(predicate=IIEP.volume, lang_field='lang')
+
+
+class UrlSerializer(ModelSerializer):
+    """Field declarations used to serialize a url entry to RDF.
+
+    ``address``, ``display`` and ``accessLevel`` are mapped to the ``IIEP``
+    predicates of the same name; none of them declares a ``lang_field``.
+    """
+    address = SimpleFieldSerializer(predicate=IIEP.address)
+    display = SimpleFieldSerializer(predicate=IIEP.display)
+    accessLevel = SimpleFieldSerializer(predicate=IIEP.accessLevel)
+
+
+
+class RecordSerializer(ModelSerializer):
+    """Top-level field declarations used to serialize a ``Record`` to RDF.
+
+    The attributes fall into four groups: plain fields, related objects
+    declared with ``value_field='uri'``, related objects declared with a
+    literal ``value_field`` (most of them with ``lang_field='lang'``), and
+    nested serializers declared in this module.  ``Command`` instantiates
+    this class once per exported record.
+    """
+    
+    # Plain fields of the record itself.
+    identifier = SimpleFieldSerializer(predicate=DCT.identifier)
+    notes = SimpleFieldSerializer(predicate=IIEP.notes)
+    editionStatement = SimpleFieldSerializer(predicate=IIEP.editionStatement)
+    recordType = SimpleFieldSerializer(predicate=DCT.type)
+    isDocumentPart = BooleanFieldSerializer(predicate=IIEP.isDocumentPart)
+
+    # Related objects declared with value_field='uri' (presumably the
+    # attribute of the related object that is emitted -- confirm in
+    # RelatedFieldSerializer).  Only ``language`` is single-valued.
+    language = RelatedFieldSerializer(many=False, value_field='uri', predicate=DCT.language) 
+    otherLanguages = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.otherLanguage)
+    subjects = RelatedFieldSerializer(many=True, value_field='uri', predicate=DCT.subject)
+    themes = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.theme)
+    countries = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.country)
+    projectNames = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.projectName)
+    subjectCorporateBodies = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.subjectCorporateBody) 
+    corporateAuthors = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.corporateAuthor)
+    
+    # Multi-valued related objects declared with a non-uri value_field.
+    # ``authors`` and ``subjectPersons`` are the only ones without a
+    # lang_field.
+    isbns = RelatedFieldSerializer(many=True, value_field='isbn', predicate=IIEP.isbn, lang_field='lang')
+    issns = RelatedFieldSerializer(many=True, value_field='issn', predicate=IIEP.issn, lang_field='lang')
+    collations = RelatedFieldSerializer(many=True, value_field='collation', predicate=IIEP.collation, lang_field='lang')
+    documentCodes = RelatedFieldSerializer(many=True, value_field='documentCode', predicate=IIEP.documentCode, lang_field='lang')
+    titles = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.title, lang_field='lang')
+    addedTitles = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.addedTitle, lang_field='lang')
+    titlesMainDocument = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.titleMainDocument, lang_field='lang')
+    abstracts = RelatedFieldSerializer(many=True, value_field='abstract', predicate=IIEP.abstract, lang_field='lang')
+    periodicals = RelatedFieldSerializer(many=True, value_field='label', predicate=IIEP.periodical, lang_field='lang')
+    authors = RelatedFieldSerializer(many=True, value_field='name', predicate=IIEP.author)
+    subjectPersons = RelatedFieldSerializer(many=True, value_field='name', predicate=IIEP.subjectPerson)
+    
+    # Multi-valued related objects handled by the nested serializers
+    # declared earlier in this module.
+    imprints = ImprintSerializer(many=True, predicate=IIEP.imprint)
+    volumeIssues = VolumeIssueSerializer(many=True, predicate=IIEP.volumeIssue)
+    meetings = MeetingSerializer(many=True, predicate=IIEP.meeting)
+    subjectMeetings = SubjectMeetingSerializer(many=True, predicate=IIEP.subjectMeeting)
+    series = SerieSerializer(many=True, predicate=IIEP.serie)
+    urls = UrlSerializer(many=True, predicate=IIEP.url)
+    
+    
+    class Meta:
+        # Presumably the rdf:type given to each serialized record and the
+        # name of the model attribute holding its URI -- confirm in
+        # ModelSerializer.  The export command relies on the serialized
+        # output containing an ``<iiep:Record`` element.
+        type = IIEP.Record
+        uri_fieldname = "uri"
+
+class Command(BaseCommand):
+
+    args = "file_path..."
+
+    help = "Export p4l record rdf format"
+
+    option_list = BaseCommand.option_list + (
+        make_option('-l', '--limit',
+            dest= 'limit',
+            type='int',
+            default=-1,
+            help= 'number of record to export. -1 is all (default)' 
+        ),
+        make_option('-s', '--skip',
+            dest= 'skip',
+            type='int',
+            default=0,
+            help= 'number of record to skip before export. default 0.' 
+        ),
+        make_option('-b', '--batch',
+            dest= 'batch',
+            type='int',
+            default=100,
+            help= 'query batch default 500.' 
+        ),
+        make_option('-j', '--bzip2',
+            dest= 'bzip2',
+            action='store_true',
+            default=False,
+            help= 'bz2 compress' 
+        ),
+        make_option('-z', '--gzip',
+            dest= 'gzip',
+            action='store_true',
+            default=False,
+            help= 'gzip compress' 
+        ),
+    )
+
+
+    def get_graph_from_object(self, obj):
+        g = get_empty_graph()
+        
+        serializer = RecordSerializer()        
+        serializer.to_graph(None, obj, None, g)
+        
+        return g
+    
+    
+    def handle(self, *args, **options):
+        
+        if len(args) != 1:
+            raise CommandError("This command takes exactly one argument")
+        
+        filepath = args[0]
+
+        bzip2 = options.get('bzip2', False)
+        gzip_opt = options.get('gzip', False)
+        
+        if bzip2 and not filepath.endswith(".bz2"):
+            filepath += ".bz2"
+        elif gzip_opt and not filepath.endswith(".gz"):
+            filepath += ".gz"            
+        
+        limit = options.get("limit", -1)
+        skip = options.get("skip", 0)
+        batch = options.get("batch", 100)
+        
+        qs = Record.objects.all().select_related(*[field.name for field in Record._meta.fields if isinstance(field, ForeignKey)]).prefetch_related(*([field.name for field in Record._meta.many_to_many] + [obj.get_accessor_name() for obj in Record._meta.get_all_related_objects()])).order_by('identifier')  # @UndefinedVariable
+        
+        if limit>=0:
+            qs = qs[skip:skip+limit]
+        else:
+            qs = qs[skip:]
+        
+        open_method = None
+        open_args = []
+        
+        if bzip2:
+            open_method = bz2.BZ2File
+            open_args = [filepath, 'wb', 9] 
+        elif gzip_opt:
+            open_method = gzip.GzipFile
+            open_args = [filepath, 'wb', 9]
+        else:
+            open_method = codecs.open
+            open_args = [filepath, 'wb', "utf-8"]
+        
+        total_records = qs.count()
+        
+        print("Total record to export : %d" % total_records)
+        progress_writer = None
+        
+        with open_method(*open_args) as dest_file:
+            writer = XMLGenerator(dest_file, "UTF-8")
+            writer.startDocument()
+            for prefix,uri in GRAPH_NAMESPACES.items():
+                writer.startPrefixMapping(prefix, uri)
+            writer.startElementNS((RDF, 'RDF'), 'RDF', AttributesNSImpl({}, {}))
+            writer.characters("\n")
+            for n in range((total_records/batch)+1):
+                for i,r in enumerate(qs[n*batch:((n+1)*batch)]):
+                    progress_writer = show_progress(i+(n*batch)+1, total_records, "Exporting record %s" % r.identifier, 50, progress_writer) 
+                    graph = self.get_graph_from_object(r)
+                    do_write = False
+                    for line in graph.serialize(format="pretty-xml", encoding="utf-8").splitlines(True):
+                        if "<iiep:Record" in line:
+                            do_write = True
+                        if do_write:
+                            dest_file.write(line.decode("utf-8"))
+                        if "</iiep:Record>" in line:
+                            break
+                
+            writer.endElementNS((RDF, 'RDF'), 'RDF')
+            writer.endDocument()
+            dest_file.write("\n")
\ No newline at end of file