src/p4l/management/commands/dump_record.py
author ymh <ymh.work@gmail.com>
Fri, 20 Sep 2013 10:34:49 +0200
changeset 107 48440ff95906
parent 103 468349edbf7f
child 126 a345f1a67bf1
permissions -rw-r--r--
small code reorg
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
# -*- coding: utf-8 -*-
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
'''
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
Created on Aug 30, 2013
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
@author: ymh
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
'''
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
import bz2
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
import codecs
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
import gzip
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
import logging
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
from optparse import make_option
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
from xml.sax.saxutils import XMLGenerator
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
from xml.sax.xmlreader import AttributesNSImpl
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
from django.core.management import BaseCommand
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
from django.core.management.base import CommandError
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
from django.db.models.fields.related import ForeignKey
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
107
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 103
diff changeset
    20
from p4l.mapping.constants import GRAPH_NAMESPACES, RDF, get_empty_graph
48440ff95906 small code reorg
ymh <ymh.work@gmail.com>
parents: 103
diff changeset
    21
from p4l.mapping import RecordSerializer
101
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    22
from p4l.models.data import Record
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    23
from p4l.utils import show_progress
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    24
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    25
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    26
logger = logging.getLogger(__name__)
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    27
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    28
71532a54d1c4 update virtualenv + implement record serialization
ymh <ymh.work@gmail.com>
parents:
diff changeset
    29
class Command(BaseCommand):
    """Export p4l ``Record`` objects to a single RDF/XML file.

    The command takes exactly one positional argument, the destination file
    path.  Every selected record is serialized to its own graph, and only the
    ``<iiep:Record>...</iiep:Record>`` fragment of each serialization is
    copied into one shared ``rdf:RDF`` root element.  The output can
    optionally be bzip2 or gzip compressed.
    """

    args = "file_path..."

    help = "Export p4l record rdf format"

    option_list = BaseCommand.option_list + (
        make_option('-l', '--limit',
            dest='limit',
            type='int',
            default=-1,
            help='number of record to export. -1 is all (default)'
        ),
        make_option('-s', '--skip',
            dest='skip',
            type='int',
            default=0,
            help='number of record to skip before export. default 0.'
        ),
        make_option('-b', '--batch',
            dest='batch',
            type='int',
            default=100,
            # The help text used to advertise 500 while the default is 100.
            help='query batch default 100.'
        ),
        make_option('-j', '--bzip2',
            dest='bzip2',
            action='store_true',
            default=False,
            help='bz2 compress'
        ),
        make_option('-z', '--gzip',
            dest='gzip',
            action='store_true',
            default=False,
            help='gzip compress'
        ),
    )

    def get_graph_from_object(self, obj):
        """Return a new graph populated from ``obj`` by ``RecordSerializer``.

        :param obj: the object handed to ``RecordSerializer.to_graph``
            (``handle`` passes ``Record`` instances).
        :returns: the graph created by ``get_empty_graph()`` after the
            serializer has filled it.
        """
        g = get_empty_graph()

        serializer = RecordSerializer()
        serializer.to_graph(None, obj, None, g)

        return g

    def _build_queryset(self, skip, limit):
        """Return the ``Record`` queryset ordered by identifier and sliced.

        Foreign keys are joined with ``select_related``; many-to-many and
        reverse relations are loaded with ``prefetch_related``.

        :param skip: number of leading records to leave out.
        :param limit: maximum number of records; a negative value means all.
        """
        foreign_keys = [
            field.name for field in Record._meta.fields
            if isinstance(field, ForeignKey)
        ]
        prefetched = (
            [field.name for field in Record._meta.many_to_many]
            + [rel.get_accessor_name()
               for rel in Record._meta.get_all_related_objects()]
        )
        qs = (Record.objects.all()  # @UndefinedVariable
              .select_related(*foreign_keys)
              .prefetch_related(*prefetched)
              .order_by('identifier'))

        if limit >= 0:
            return qs[skip:skip + limit]
        return qs[skip:]

    def _open_destination(self, filepath, bzip2, gzip_opt):
        """Open ``filepath`` for writing and return a UTF-8 text writer.

        bzip2 wins when both compression flags are set.  Compressed streams
        are binary, so they are wrapped in a UTF-8 ``codecs`` writer; that
        way decoded text can be written to every kind of destination, exactly
        as with the plain ``codecs.open`` file.
        """
        if bzip2:
            # compresslevel must be passed by keyword: the third positional
            # parameter of BZ2File is ``buffering``, not the level.
            raw = bz2.BZ2File(filepath, 'wb', compresslevel=9)
        elif gzip_opt:
            raw = gzip.GzipFile(filepath, 'wb', compresslevel=9)
        else:
            return codecs.open(filepath, 'wb', "utf-8")
        # Closing the writer (e.g. leaving a ``with`` block) closes ``raw``.
        return codecs.getwriter("utf-8")(raw)

    def _write_record(self, dest_file, record):
        """Write the ``iiep:Record`` fragment of ``record`` to ``dest_file``.

        The record is serialized as a standalone pretty-xml document; only
        the lines from the one containing ``<iiep:Record`` up to and
        including the one containing ``</iiep:Record>`` are copied, so the
        per-record XML prolog and ``rdf:RDF`` wrapper are dropped.
        """
        graph = self.get_graph_from_object(record)
        serialized = graph.serialize(format="pretty-xml", encoding="utf-8")
        in_record = False
        for raw_line in serialized.splitlines(True):
            line = raw_line.decode("utf-8")
            if "<iiep:Record" in line:
                in_record = True
            if in_record:
                dest_file.write(line)
            if "</iiep:Record>" in line:
                break

    def handle(self, *args, **options):
        """Run the export.

        :param args: must contain exactly one item, the destination path.
        :param options: parsed command options (``limit``, ``skip``,
            ``batch``, ``bzip2``, ``gzip``).
        :raises CommandError: if the argument count is wrong, if ``batch``
            is not strictly positive, or if ``skip`` is negative.
        """
        if len(args) != 1:
            raise CommandError("This command takes exactly one argument")

        filepath = args[0]

        bzip2 = options.get('bzip2', False)
        gzip_opt = options.get('gzip', False)

        # Only add the extension of the compression actually used.  bzip2
        # has priority, so a ".bz2" path must never also receive ".gz".
        if bzip2:
            if not filepath.endswith(".bz2"):
                filepath += ".bz2"
        elif gzip_opt and not filepath.endswith(".gz"):
            filepath += ".gz"

        limit = options.get("limit", -1)
        skip = options.get("skip", 0)
        batch = options.get("batch", 100)

        # Fail early with a readable message rather than a division by zero
        # or an invalid queryset slice further down.
        if batch <= 0:
            raise CommandError("The batch size must be a positive integer")
        if skip < 0:
            raise CommandError("The number of records to skip can not be negative")

        qs = self._build_queryset(skip, limit)

        total_records = qs.count()

        print("Total record to export : %d" % total_records)
        progress_writer = None

        # Ceiling division: no float on Python 3 and no extra empty query
        # when the total is an exact multiple of the batch size.
        batch_count = (total_records + batch - 1) // batch

        with self._open_destination(filepath, bzip2, gzip_opt) as dest_file:
            writer = XMLGenerator(dest_file, "UTF-8")
            writer.startDocument()
            for prefix, uri in GRAPH_NAMESPACES.items():
                writer.startPrefixMapping(prefix, uri)
            writer.startElementNS((RDF, 'RDF'), 'RDF', AttributesNSImpl({}, {}))
            writer.characters("\n")
            for n in range(batch_count):
                start = n * batch
                for i, r in enumerate(qs[start:start + batch]):
                    progress_writer = show_progress(start + i + 1, total_records, "Exporting record %s" % r.identifier, 50, progress_writer)
                    self._write_record(dest_file, r)

            writer.endElementNS((RDF, 'RDF'), 'RDF')
            writer.endDocument()
            dest_file.write("\n")