web/hdabo/management/commands/importcsv.py
author ymh <ymh.work@gmail.com>
Fri, 10 Jun 2011 20:53:40 +0200
changeset 19 e2f27df4e17b
parent 15 a9136d8f0b4a
child 21 20d3375b6d28
permissions -rw-r--r--
some changes to import all data from export

# -*- coding: utf-8 -*-
'''
Created on May 25, 2011

@author: ymh
'''
#Auteur,Chemin,Comment,Controle,Datcre,Datmaj,Desc,Domaine,Format,ID,Insee,Org,Org_Home,OrgID,Periode1,Periode2,Periode3,Satut,Sousdom,Tag,Theme2,Theme3,Titre,Url,Vignette,Ville
#"Auteur","Chemin","Comment","Controle","Datcre","Datmaj","Desc","Domaine","Format","ID","Insee","Org","Org_Home","OrgID","Periode1","Periode2","Periode3","Satut","Sousdom","Tag","Theme2","Theme3","Titre","Url","Vignette","Ville",

from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from hdabo.models import (Author, Datasheet, DocumentFormat, Domain, Organisation, 
    Tag, TaggedSheet, TimePeriod, Location)
from optparse import make_option
import csv
import datetime
import math
import sys

class Command(BaseCommand):
    '''
    Command to import csvfile
    '''
    args = '<path_to_csv_file path_to_csv_file ...>'
    options = '[--ignore-existing] [--lines] [--encoding]'
    help = """Import of a csv file for hdabo
Options:
    --ignore-existing : ignore existing datasheets
    --lines : max number of lines to load (for each file). 0 means all.
    --encoding : files encoding. default to latin-1"""
    
    option_list = BaseCommand.option_list + (
        make_option('--encoding',
            action='store',
            type='string',
            dest='encoding',
            default="latin-1",
            help='fix the file encoding. default to latin-1'),
        make_option('--delimiter',
            action='store',
            type='string',
            dest='delimiter',
            default=";",
            help='csv file delimiter'),
        make_option('--dialect',
            action='store',
            type='string',
            dest='dialect',
            default="excel",
            help='csv dialect'),
        make_option('--fieldnames',
            action='store',
            type='string',
            dest='fieldnames',
            default=None,
            help='fields list (comma separated)'),
        make_option('--lines',
            action='store',
            type='int',
            dest='lines',
            default=0,
            help='Number of lines to read. 0 means all.'),
        make_option('--ignore-existing',
            action='store_true',
            dest='ignore_existing',
            default=False,
            help='force insertion'),
        
        )
    
    def show_progress(self, current_line, total_line, width):

        percent = (float(current_line)/float(total_line))*100.0

        marks = math.floor(width * (percent / 100.0))
        spaces = math.floor(width - marks)
    
        loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']'
    
        sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line-1, total_line-1)) #takes the header into account
        if percent >= 100:
            sys.stdout.write("\n")
        sys.stdout.flush()


    def normalize_tag(self, tag):
        if len(tag) == 0:
            return tag
        tag = tag.strip()
        tag = tag.replace("_", " ")
        tag = " ".join(tag.split())
        tag = tag[0].upper() + tag[1:]
        return tag
    
    def create_domain_period(self, row_value, klass, school_period):
        res_list = []
        if not row_value:
            return res_list
        for label_str in [dstr.strip() for dstr in row_value.split('\x0b')]:
            if label_str:
                res_obj, created = klass.objects.get_or_create(label=label_str, school_period=school_period, defaults={"label":label_str,"school_period":school_period}) #@UnusedVariable
                res_list.append(res_obj)
        return res_list
    
    def create_datasheet(self, row):
        
        if self.ignore_existing and Datasheet.objects.filter(hda_id=row[u"ID"]).count() > 0:
            return
        
        author_str = row[u'Auteur']
        if author_str:
            author_array = author_str.split(" ")
            if len(author_array) == 0:
                firstname = ""
                lastname = ""
            elif len(author_array) == 1:
                firstname = ""
                lastname = author_array[0]
            elif len(author_array) == 2:
                firstname = author_array[0]
                lastname = author_array[1]
                
            author, created = Author.objects.get_or_create(hda_id=author_str, defaults={"firstname":firstname, "lastname":lastname}) #@UnusedVariable
        else:
            author = None
        
        org_str = row[u"Org"]    
        if org_str:
            url_str = row[u'Org_Home']
            if url_str is not None:
                url_str = url_str.strip()
            org, created = Organisation.objects.get_or_create(hda_id=org_str, defaults={"name":org_str, "website" : url_str}) #@UnusedVariable
        else:
            org = None
            
        town_str = row[u"Ville"]
        if town_str:
            insee_str = row[u'Insee'].strip() if row[u'Insee'] else row[u'Insee']
            if len(insee_str)>5:
                insee_str = "" 
            loc, created = Location.objects.get_or_create(insee=insee_str, defaults={"name": town_str, "insee": insee_str}) #@UnusedVariable
        else:
            loc = None
            
        format_str = row[u"Format"]
        if format_str:
            format, created = DocumentFormat.objects.get_or_create(label=format_str, defaults={"label": format_str}) #@UnusedVariable
        else:
            format = None
        
        domains = self.create_domain_period(row[u"Domaine"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Global'])
                                        
        primary_periods = self.create_domain_period(row[u"Periode1"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Primaire'])
        college_periods = self.create_domain_period(row[u"Periode2"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Collège'])
        highschool_periods = self.create_domain_period(row[u"Periode3"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Lycée'])
                    
        primary_themes = self.create_domain_period(row[u"Sousdom"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Primaire'])
        college_themes = self.create_domain_period(row[u"Theme2"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Collège'])
        highschool_themes = self.create_domain_period(row[u"Theme3"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Lycée'])
        
        url = row[u"Url"]
        if url is not None:
            url = url.strip()
        
        datasheet = Datasheet.objects.create(
            hda_id = row[u"ID"],
            author = author,
            organisation = org,
            title = row[u"Titre"],
            description = row[u"Desc"],
            url = url,
            town = loc,
            format = format,
            original_creation_date = datetime.datetime.strptime(row[u"Datcre"], "%d/%m/%Y").date(),
            original_modification_date = datetime.datetime.strptime(row[u"Datmaj"], "%d/%m/%Y").date(),
            validated = False                                          
        )
        
        datasheet.save()
        
        datasheet.domains = domains
        datasheet.primary_periods = primary_periods
        datasheet.college_periods = college_periods
        datasheet.highschool_periods = highschool_periods
        datasheet.primary_themes = primary_themes
        datasheet.college_themes = college_themes
        datasheet.highschool_themes = highschool_themes

        
        if row[u'Tag']:
            for i,tag in enumerate([t.strip() for t in row[u'Tag'].split(u";")]):
                if len(tag)==0:
                    continue
                tag_label = self.normalize_tag(tag)
                tag_objs = Tag.objects.filter(label__iexact=tag_label)
                if len(tag_objs) == 0:
                    tag_obj = Tag(label=tag_label,original_label=tag)
                    tag_obj.save()
                else:
                    tag_obj = tag_objs[0]
                tagged_ds = TaggedSheet(datasheet=datasheet, tag=tag_obj, original_order=i+1, order=i+1)
                tagged_ds.save()
        

    def handle(self, *args, **options):
        
        if len(args)==0:
            raise CommandError("Gives at lat one csv file to import")
        
        self.encoding = options.get('encoding', "latin-1")
        lines = options.get('lines',0)
        self.ignore_existing = options.get('ignore_existing', False)
        fieldnames = options.get('fieldnames',None)

        transaction.commit_unless_managed()
        transaction.enter_transaction_management()
        transaction.managed(True)

        try:
            for csv_path in args:
                try:
                    print "Processing %s " % (csv_path)
                    with open(csv_path, 'rU') as csv_file:
                        
                        # get the number of lines if necessary
                        if not lines:
                            for i,l in enumerate(csv_file): #@UnusedVariable
                                pass                        
                            total_line = i+1
                            if fieldnames:
                                total_line = total_line + 1
                            csv_file.seek(0)
                        else:
                            total_line = lines+1
                        
                        dr_kwargs = {'delimiter':options.get('delimiter',";")}
                        if  fieldnames is not None:
                            dr_kwargs['fieldnames'] = [f.strip() for f in fieldnames.split(",")]
                        dialect = options.get('dialect', "excel")
                        if dialect is not None:
                            dr_kwargs['dialect'] = dialect
                           
                        reader = csv.DictReader(csv_file, **dr_kwargs)
        
                        for j,row in enumerate(reader):
                            if lines and j>=lines:
                                break
                            line_num = reader.line_num if fieldnames is None else reader.line_num+1
                            self.show_progress(line_num, total_line, 60)
                            def safe_decode(val,encoding):
                                if val:
                                    return val.decode(encoding)
                                else:
                                    return val
                                                        
                            row = dict([(safe_decode(key,self.encoding), safe_decode(value,self.encoding)) for key, value in row.items()])
                            self.create_datasheet(row)
                            
                            transaction.commit()
                except Exception:
                    transaction.rollback()
                    raise 
                finally:
                    print('')
        finally:
            transaction.leave_transaction_management()