src/p4l/management/commands/dump_record.py
changeset 101 71532a54d1c4
child 103 468349edbf7f
equal deleted inserted replaced
89:49fda47ceb16 101:71532a54d1c4
       
     1 # -*- coding: utf-8 -*-
       
     2 '''
       
     3 Created on Aug 30, 2013
       
     4 
       
     5 @author: ymh
       
     6 '''
       
     7 
       
     8 import bz2
       
     9 import codecs
       
    10 import gzip
       
    11 import logging
       
    12 from optparse import make_option
       
    13 from xml.sax.saxutils import XMLGenerator
       
    14 from xml.sax.xmlreader import AttributesNSImpl
       
    15 
       
    16 from django.core.management import BaseCommand
       
    17 from django.core.management.base import CommandError
       
    18 from django.db.models.fields.related import ForeignKey
       
    19 
       
    20 from p4l.management.constants import (GRAPH_NAMESPACES, RDF, get_empty_graph, 
       
    21     IIEP, DCT)
       
    22 from p4l.mapping.serializers import (ModelSerializer, SimpleFieldSerializer, 
       
    23     BooleanFieldSerializer, RelatedFieldSerializer)
       
    24 from p4l.models.data import Record
       
    25 from p4l.utils import show_progress
       
    26 from rdflib.namespace import RDFS
       
    27 
       
    28 
       
    29 logger = logging.getLogger(__name__)
       
    30 
       
    31 class ImprintSerializer(ModelSerializer):
       
    32     
       
    33     imprintCity = SimpleFieldSerializer(predicate=IIEP.imprintCity, lang_field='lang')
       
    34     publisher = SimpleFieldSerializer(predicate=IIEP.publisher, lang_field='lang')
       
    35     imprintDate = SimpleFieldSerializer(predicate=IIEP.imprintDate, lang_field='lang')
       
    36 
       
    37 
       
    38 class VolumeIssueSerializer(ModelSerializer):
       
    39     volume = SimpleFieldSerializer(predicate=IIEP.volume, lang_field='lang')
       
    40     number = SimpleFieldSerializer(predicate=IIEP.number, lang_field='lang')
       
    41 
       
    42 
       
    43 class MeetingSerializer(ModelSerializer):
       
    44     label = SimpleFieldSerializer(predicate=RDFS.label, lang_field='lang')
       
    45     meetingNumber = SimpleFieldSerializer(predicate=IIEP.meetingNumber, lang_field='lang')
       
    46     meetingPlace = SimpleFieldSerializer(predicate=IIEP.meetingPlace, lang_field='lang')
       
    47     meetingDate = SimpleFieldSerializer(predicate=IIEP.meetingDate, lang_field='lang')
       
    48     meetingYear = SimpleFieldSerializer(predicate=IIEP.meetingYear, lang_field='lang')
       
    49 
       
    50 class SubjectMeetingSerializer(ModelSerializer):
       
    51     label = SimpleFieldSerializer(predicate=RDFS.label)
       
    52     meetingNumber = SimpleFieldSerializer(predicate=IIEP.meetingNumber)
       
    53     meetingPlace = SimpleFieldSerializer(predicate=IIEP.meetingPlace)
       
    54     meetingDate = SimpleFieldSerializer(predicate=IIEP.meetingDate)
       
    55     meetingYear = SimpleFieldSerializer(predicate=IIEP.meetingYear)
       
    56 
       
    57 
       
    58 class SerieSerializer(ModelSerializer):
       
    59     title = SimpleFieldSerializer(predicate=DCT.title, lang_field='lang')
       
    60     volume = SimpleFieldSerializer(predicate=IIEP.volume, lang_field='lang')
       
    61 
       
    62 
       
    63 class UrlSerializer(ModelSerializer):
       
    64     address = SimpleFieldSerializer(predicate=IIEP.address)
       
    65     display = SimpleFieldSerializer(predicate=IIEP.display)
       
    66     accessLevel = SimpleFieldSerializer(predicate=IIEP.accessLevel)
       
    67 
       
    68 
       
    69 
       
    70 class RecordSerializer(ModelSerializer):
       
    71     
       
    72     identifier = SimpleFieldSerializer(predicate=DCT.identifier)
       
    73     notes = SimpleFieldSerializer(predicate=IIEP.notes)
       
    74     editionStatement = SimpleFieldSerializer(predicate=IIEP.editionStatement)
       
    75     recordType = SimpleFieldSerializer(predicate=DCT.type)
       
    76     isDocumentPart = BooleanFieldSerializer(predicate=IIEP.isDocumentPart)
       
    77 
       
    78     language = RelatedFieldSerializer(many=False, value_field='uri', predicate=DCT.language) 
       
    79     otherLanguages = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.otherLanguage)
       
    80     subjects = RelatedFieldSerializer(many=True, value_field='uri', predicate=DCT.subject)
       
    81     themes = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.theme)
       
    82     countries = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.country)
       
    83     projectNames = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.projectName)
       
    84     subjectCorporateBodies = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.subjectCorporateBody) 
       
    85     corporateAuthors = RelatedFieldSerializer(many=True, value_field='uri', predicate=IIEP.corporateAuthor)
       
    86     
       
    87     isbns = RelatedFieldSerializer(many=True, value_field='isbn', predicate=IIEP.isbn, lang_field='lang')
       
    88     issns = RelatedFieldSerializer(many=True, value_field='issn', predicate=IIEP.issn, lang_field='lang')
       
    89     collations = RelatedFieldSerializer(many=True, value_field='collation', predicate=IIEP.collation, lang_field='lang')
       
    90     documentCodes = RelatedFieldSerializer(many=True, value_field='documentCode', predicate=IIEP.documentCode, lang_field='lang')
       
    91     titles = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.title, lang_field='lang')
       
    92     addedTitles = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.addedTitle, lang_field='lang')
       
    93     titlesMainDocument = RelatedFieldSerializer(many=True, value_field='title', predicate=IIEP.titleMainDocument, lang_field='lang')
       
    94     abstracts = RelatedFieldSerializer(many=True, value_field='abstract', predicate=IIEP.abstract, lang_field='lang')
       
    95     periodicals = RelatedFieldSerializer(many=True, value_field='label', predicate=IIEP.periodical, lang_field='lang')
       
    96     authors = RelatedFieldSerializer(many=True, value_field='name', predicate=IIEP.author)
       
    97     subjectPersons = RelatedFieldSerializer(many=True, value_field='name', predicate=IIEP.subjectPerson)
       
    98     
       
    99     imprints = ImprintSerializer(many=True, predicate=IIEP.imprint)
       
   100     volumeIssues = VolumeIssueSerializer(many=True, predicate=IIEP.volumeIssue)
       
   101     meetings = MeetingSerializer(many=True, predicate=IIEP.meeting)
       
   102     subjectMeetings = SubjectMeetingSerializer(many=True, predicate=IIEP.subjectMeeting)
       
   103     series = SerieSerializer(many=True, predicate=IIEP.serie)
       
   104     urls = UrlSerializer(many=True, predicate=IIEP.url)
       
   105     
       
   106     
       
   107     class Meta:
       
   108         type = IIEP.Record
       
   109         uri_fieldname = "uri"
       
   110 
       
   111 class Command(BaseCommand):
       
   112 
       
   113     args = "file_path..."
       
   114 
       
   115     help = "Export p4l record rdf format"
       
   116 
       
   117     option_list = BaseCommand.option_list + (
       
   118         make_option('-l', '--limit',
       
   119             dest= 'limit',
       
   120             type='int',
       
   121             default=-1,
       
   122             help= 'number of record to export. -1 is all (default)' 
       
   123         ),
       
   124         make_option('-s', '--skip',
       
   125             dest= 'skip',
       
   126             type='int',
       
   127             default=0,
       
   128             help= 'number of record to skip before export. default 0.' 
       
   129         ),
       
   130         make_option('-b', '--batch',
       
   131             dest= 'batch',
       
   132             type='int',
       
   133             default=100,
       
   134             help= 'query batch default 500.' 
       
   135         ),
       
   136         make_option('-j', '--bzip2',
       
   137             dest= 'bzip2',
       
   138             action='store_true',
       
   139             default=False,
       
   140             help= 'bz2 compress' 
       
   141         ),
       
   142         make_option('-z', '--gzip',
       
   143             dest= 'gzip',
       
   144             action='store_true',
       
   145             default=False,
       
   146             help= 'gzip compress' 
       
   147         ),
       
   148     )
       
   149 
       
   150 
       
   151     def get_graph_from_object(self, obj):
       
   152         g = get_empty_graph()
       
   153         
       
   154         serializer = RecordSerializer()        
       
   155         serializer.to_graph(None, obj, None, g)
       
   156         
       
   157         return g
       
   158     
       
   159     
       
   160     def handle(self, *args, **options):
       
   161         
       
   162         if len(args) != 1:
       
   163             raise CommandError("This command takes exactly one argument")
       
   164         
       
   165         filepath = args[0]
       
   166 
       
   167         bzip2 = options.get('bzip2', False)
       
   168         gzip_opt = options.get('gzip', False)
       
   169         
       
   170         if bzip2 and not filepath.endswith(".bz2"):
       
   171             filepath += ".bz2"
       
   172         elif gzip_opt and not filepath.endswith(".gz"):
       
   173             filepath += ".gz"            
       
   174         
       
   175         limit = options.get("limit", -1)
       
   176         skip = options.get("skip", 0)
       
   177         batch = options.get("batch", 100)
       
   178         
       
   179         qs = Record.objects.all().select_related(*[field.name for field in Record._meta.fields if isinstance(field, ForeignKey)]).prefetch_related(*([field.name for field in Record._meta.many_to_many] + [obj.get_accessor_name() for obj in Record._meta.get_all_related_objects()])).order_by('identifier')  # @UndefinedVariable
       
   180         
       
   181         if limit>=0:
       
   182             qs = qs[skip:skip+limit]
       
   183         else:
       
   184             qs = qs[skip:]
       
   185         
       
   186         open_method = None
       
   187         open_args = []
       
   188         
       
   189         if bzip2:
       
   190             open_method = bz2.BZ2File
       
   191             open_args = [filepath, 'wb', 9] 
       
   192         elif gzip_opt:
       
   193             open_method = gzip.GzipFile
       
   194             open_args = [filepath, 'wb', 9]
       
   195         else:
       
   196             open_method = codecs.open
       
   197             open_args = [filepath, 'wb', "utf-8"]
       
   198         
       
   199         total_records = qs.count()
       
   200         
       
   201         print("Total record to export : %d" % total_records)
       
   202         progress_writer = None
       
   203         
       
   204         with open_method(*open_args) as dest_file:
       
   205             writer = XMLGenerator(dest_file, "UTF-8")
       
   206             writer.startDocument()
       
   207             for prefix,uri in GRAPH_NAMESPACES.items():
       
   208                 writer.startPrefixMapping(prefix, uri)
       
   209             writer.startElementNS((RDF, 'RDF'), 'RDF', AttributesNSImpl({}, {}))
       
   210             writer.characters("\n")
       
   211             for n in range((total_records/batch)+1):
       
   212                 for i,r in enumerate(qs[n*batch:((n+1)*batch)]):
       
   213                     progress_writer = show_progress(i+(n*batch)+1, total_records, "Exporting record %s" % r.identifier, 50, progress_writer) 
       
   214                     graph = self.get_graph_from_object(r)
       
   215                     do_write = False
       
   216                     for line in graph.serialize(format="pretty-xml", encoding="utf-8").splitlines(True):
       
   217                         if "<iiep:Record" in line:
       
   218                             do_write = True
       
   219                         if do_write:
       
   220                             dest_file.write(line.decode("utf-8"))
       
   221                         if "</iiep:Record>" in line:
       
   222                             break
       
   223                 
       
   224             writer.endElementNS((RDF, 'RDF'), 'RDF')
       
   225             writer.endDocument()
       
   226             dest_file.write("\n")