| author | cavaliet |
| Tue, 17 Jun 2014 10:25:33 +0200 | |
| changeset 271 | 8f77cf71ab02 |
| parent 104 | web/hdabo/management/commands/import_csv.py@28a2c02ef6c8 |
| child 693 | 09e00f38d177 |
| permissions | -rw-r--r-- |
| 0 | 1 |
# -*- coding: utf-8 -*- |
2 |
''' |
|
3 |
Created on May 25, 2011 |
|
4 |
||
5 |
@author: ymh |
|
6 |
''' |
|
|
19
e2f27df4e17b
some changes to import all data from export
ymh <ymh.work@gmail.com>
parents:
15
diff
changeset
|
7 |
#Auteur,Chemin,Comment,Controle,Datcre,Datmaj,Desc,Domaine,Format,ID,Insee,Org,Org_Home,OrgID,Periode1,Periode2,Periode3,Satut,Sousdom,Tag,Theme2,Theme3,Titre,Url,Vignette,Ville |
|
e2f27df4e17b
some changes to import all data from export
ymh <ymh.work@gmail.com>
parents:
15
diff
changeset
|
8 |
#"Auteur","Chemin","Comment","Controle","Datcre","Datmaj","Desc","Domaine","Format","ID","Insee","Org","Org_Home","OrgID","Periode1","Periode2","Periode3","Satut","Sousdom","Tag","Theme2","Theme3","Titre","Url","Vignette","Ville", |
|
e2f27df4e17b
some changes to import all data from export
ymh <ymh.work@gmail.com>
parents:
15
diff
changeset
|
9 |
|
| 0 | 10 |
from django.core.management.base import BaseCommand, CommandError |
| 2 | 11 |
from django.db import transaction |
| 23 | 12 |
from hdabo.models import (Author, Datasheet, DocumentFormat, Domain, Organisation, |
| 2 | 13 |
Tag, TaggedSheet, TimePeriod, Location) |
|
25
e5f8cb1020c8
add command to reset wikipedia info on a tag
ymh <ymh.work@gmail.com>
parents:
23
diff
changeset
|
14 |
from hdabo.wp_utils import normalize_tag |
| 2 | 15 |
from optparse import make_option |
| 0 | 16 |
import csv |
| 2 | 17 |
import datetime |
18 |
import math |
|
19 |
import sys |
|
| 0 | 20 |
|
21 |
class Command(BaseCommand):
    '''
    Management command importing one or more csv files into the hdabo models.

    Each csv row becomes one Datasheet, together with the Author,
    Organisation, Location, DocumentFormat, Domain, TimePeriod, Tag and
    TaggedSheet objects it refers to. Every file is imported in its own
    transaction: it is committed once the whole file has been read and
    rolled back if any row fails.
    '''
    args = '<path_to_csv_file path_to_csv_file ...>'
    options = '[--ignore-existing] [--lines] [--encoding]'
    help = """Import of a csv file for hdabo
Options:
    --ignore-existing : ignore existing datasheets
    --lines : max number of lines to load (for each file). 0 means all.
    --encoding : files encoding. default to latin-1"""

    option_list = BaseCommand.option_list + (
        make_option('--encoding',
            action='store',
            type='string',
            dest='encoding',
            default="latin-1",
            help='fix the file encoding. default to latin-1'),
        make_option('--delimiter',
            action='store',
            type='string',
            dest='delimiter',
            default=";",
            help='csv file delimiter'),
        make_option('--dialect',
            action='store',
            type='string',
            dest='dialect',
            default="excel",
            help='csv dialect'),
        make_option('--fieldnames',
            action='store',
            type='string',
            dest='fieldnames',
            default=None,
            help='fields list (comma separated)'),
        make_option('--lines',
            action='store',
            type='int',
            dest='lines',
            default=0,
            help='Number of lines to read. 0 means all.'),
        make_option('--ignore-existing',
            action='store_true',
            dest='ignore_existing',
            default=False,
            help='force insertion'),
        )

    def show_progress(self, current_line, total_line, width):
        '''
        Draw a one-line text progress bar on stdout.

        current_line -- number of the line being processed (header included)
        total_line   -- total number of lines (header included)
        width        -- number of characters between the brackets
        '''
        if total_line:
            percent = (float(current_line) / float(total_line)) * 100.0
        else:
            # nothing to process: show a full bar instead of dividing by zero
            percent = 100.0

        marks = math.floor(width * (percent / 100.0))
        spaces = math.floor(width - marks)

        loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']'

        # the counters are shifted by one to take the header line into account
        sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line - 1, total_line - 1))
        if percent >= 100:
            sys.stdout.write("\n")
        sys.stdout.flush()

    def create_domain_period(self, row_value, klass, school_period):
        '''
        Get or create one klass object per label found in row_value.

        row_value     -- cell content; labels are separated by a vertical
                         tab ('\\x0b'). May be empty or None.
        klass         -- model class to query (called with Domain and
                         TimePeriod in this command)
        school_period -- value stored in the school_period field

        Returns the list of objects, in the order of the labels. Blank labels
        are skipped.
        '''
        res_list = []
        if not row_value:
            return res_list

        for label_str in [dstr.strip() for dstr in row_value.split('\x0b')]:
            if not label_str:
                continue
            res_obj, created = klass.objects.get_or_create(label=label_str, school_period=school_period, defaults={"label":label_str, "school_period":school_period}) #@UnusedVariable
            res_list.append(res_obj)
        return res_list

    @staticmethod
    def _safe_decode(val, encoding):
        '''Decode a byte string with encoding; falsy values are returned unchanged.'''
        if val:
            return val.decode(encoding)
        return val

    @staticmethod
    def _split_author_name(author_str):
        '''
        Split an author string into a (firstname, lastname) pair.

        A single word is a lastname. Otherwise the first word is the
        firstname and everything else the lastname, so that names made of
        more than two words no longer leave the two values undefined.
        '''
        parts = author_str.split(" ")
        if len(parts) == 1:
            return "", parts[0]
        return parts[0], " ".join(parts[1:])

    def _attach_tags(self, datasheet, tag_field):
        '''
        Link datasheet to the tags listed in tag_field (";" separated).

        An existing tag is looked up case-insensitively on its normalized
        label; a tag whose url_status is not 'null_result' is preferred. A
        new Tag is created when none matches. The order stored on the
        TaggedSheet is the 1-based position in the cell, blank entries
        included.
        '''
        if not tag_field:
            return

        null_result = Tag.TAG_URL_STATUS_DICT['null_result']
        for i, tag in enumerate([t.strip() for t in tag_field.split(u";")]):
            if len(tag) == 0:
                continue
            tag_label = normalize_tag(tag)

            tag_obj = None
            for candidate in Tag.objects.filter(label__iexact=tag_label):
                if tag_obj is None or candidate.url_status != null_result:
                    tag_obj = candidate
                    if tag_obj.url_status != null_result:
                        break

            if tag_obj is None:
                tag_obj = Tag(label=tag_label, original_label=tag)
                tag_obj.save()

            tagged_ds = TaggedSheet(datasheet=datasheet, tag=tag_obj, original_order=i + 1, order=i + 1)
            tagged_ds.save()

    def create_datasheet(self, row):
        '''
        Create the Datasheet described by row and all its related objects.

        row -- dict of unicode values keyed by the csv column names.

        Nothing is created when the ignore_existing option is set and a
        datasheet with the same hda_id already exists.
        '''
        if self.ignore_existing and Datasheet.objects.filter(hda_id=row[u"ID"]).count() > 0:
            return

        author_str = row[u'Auteur']
        if author_str:
            firstname, lastname = self._split_author_name(author_str)
            author, created = Author.objects.get_or_create(hda_id=author_str, defaults={"firstname":firstname, "lastname":lastname}) #@UnusedVariable
        else:
            author = None

        org_str = row[u"Org"]
        if org_str:
            url_str = row[u'Org_Home']
            if url_str is not None:
                url_str = url_str.strip()
            org, created = Organisation.objects.get_or_create(hda_id=org_str, defaults={"name":org_str, "website" : url_str}) #@UnusedVariable
        else:
            org = None

        town_str = row[u"Ville"]
        if town_str:
            # a missing Insee cell (None) is handled like an empty one
            insee_str = (row[u'Insee'] or u"").strip()
            if len(insee_str) > 5:
                # longer than 5 characters: discard the code
                insee_str = ""
            loc, created = Location.objects.get_or_create(insee=insee_str, defaults={"name": town_str, "insee": insee_str}) #@UnusedVariable
        else:
            loc = None

        format_str = row[u"Format"]
        if format_str:
            doc_format, created = DocumentFormat.objects.get_or_create(label=format_str, defaults={"label": format_str}) #@UnusedVariable
        else:
            doc_format = None

        domains = self.create_domain_period(row[u"Domaine"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Global'])

        primary_periods = self.create_domain_period(row[u"Periode1"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Primaire'])
        college_periods = self.create_domain_period(row[u"Periode2"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Collège'])
        highschool_periods = self.create_domain_period(row[u"Periode3"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Lycée'])

        primary_themes = self.create_domain_period(row[u"Sousdom"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Primaire'])
        college_themes = self.create_domain_period(row[u"Theme2"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Collège'])
        highschool_themes = self.create_domain_period(row[u"Theme3"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Lycée'])

        url = row[u"Url"]
        if url is not None:
            url = url.strip()

        datasheet = Datasheet.objects.create(
            hda_id=row[u"ID"],
            author=author,
            organisation=org,
            title=row[u"Titre"],
            description=row[u"Desc"],
            url=url,
            town=loc,
            format=doc_format,
            original_creation_date=datetime.datetime.strptime(row[u"Datcre"], "%d/%m/%Y").date(),
            original_modification_date=datetime.datetime.strptime(row[u"Datmaj"], "%d/%m/%Y").date(),
            validated=False
        )

        datasheet.save()

        datasheet.set_domains(domains)
        datasheet.set_primary_periods(primary_periods)
        datasheet.set_college_periods(college_periods)
        datasheet.set_highschool_periods(highschool_periods)
        datasheet.set_primary_themes(primary_themes)
        datasheet.set_college_themes(college_themes)
        datasheet.set_highschool_themes(highschool_themes)

        self._attach_tags(datasheet, row[u'Tag'])

    def handle(self, *args, **options):
        '''
        Import every csv file given in args.

        Raises CommandError when no file is given. Any exception raised
        while importing a file rolls back that file's transaction and is
        re-raised.
        '''
        if len(args) == 0:
            raise CommandError("Gives at lat one csv file to import")

        self.encoding = options.get('encoding', "latin-1")
        lines = options.get('lines', 0)
        self.ignore_existing = options.get('ignore_existing', False)
        # an empty --fieldnames value means "read the names from the header"
        fieldnames = options.get('fieldnames', None) or None

        # the reader arguments do not depend on the file: build them once
        delimiter = options.get('delimiter', ";")
        if delimiter == "TAB" or delimiter == "\\t":
            delimiter = '\t'

        dr_kwargs = {'delimiter':delimiter}
        if fieldnames is not None:
            dr_kwargs['fieldnames'] = [f.strip() for f in fieldnames.split(",")]
        dialect = options.get('dialect', "excel")
        if dialect is not None:
            dr_kwargs['dialect'] = dialect

        transaction.commit_unless_managed()
        transaction.enter_transaction_management()
        transaction.managed(True)

        try:
            for csv_path in args:
                try:
                    print("Processing %s " % (csv_path))
                    with open(csv_path, 'rU') as csv_file:

                        # get the number of lines if necessary
                        if not lines:
                            # sum() also copes with an empty file
                            total_line = sum(1 for _ in csv_file)
                            if fieldnames is not None:
                                # the file has no header line: count a virtual
                                # one, show_progress expects it
                                total_line = total_line + 1
                            csv_file.seek(0)
                        else:
                            total_line = lines + 1

                        reader = csv.DictReader(csv_file, **dr_kwargs)

                        for j, row in enumerate(reader):
                            if lines and j >= lines:
                                break
                            line_num = reader.line_num if fieldnames is None else reader.line_num + 1
                            self.show_progress(line_num, total_line, 60)

                            row = dict((self._safe_decode(key, self.encoding), self._safe_decode(value, self.encoding)) for key, value in row.items())
                            self.create_datasheet(row)

                    transaction.commit()
                except Exception:
                    transaction.rollback()
                    raise
                finally:
                    # end the progress line, whatever the outcome
                    print('')
        finally:
            transaction.leave_transaction_management()