|
1 # -*- coding: utf-8 -*- |
|
2 ''' |
|
3 Created on Jan 30, 2012 |
|
4 |
|
5 @author: ymh |
|
6 ''' |
|
7 |
|
8 from django.core.management.base import NoArgsCommand |
|
9 from django.core.management.color import no_style |
|
10 |
|
11 from optparse import make_option |
|
12 from django.conf import settings |
|
13 from django.db.models import Count |
|
14 from django.db import transaction |
|
15 from hdabo.models import Tag |
|
16 from hdalab.models import DbpediaFields, TagLinks |
|
17 from hdabo.utils import show_progress |
|
18 from rdflib.graph import Graph |
|
19 from rdflib import URIRef |
|
20 import re |
|
21 import traceback |
|
22 import sys |
|
23 from hdalab.models.dataviz import DbpediaFieldsTranslation |
|
24 from django import db |
|
25 |
|
class Command(NoArgsCommand):
    '''
    Query dbpedia for a set of tags and refresh the locally stored data.

    For every selected Tag that has a ``dbpedia_uri`` the command downloads
    the N3 description of the resource, then inside one transaction:

    * links the tag (TagLinks) to every other known Tag whose dbpedia uri
      appears as the object of a triple,
    * creates or updates the DbpediaFields row (label, abstract, thumbnail),
    * recreates one DbpediaFieldsTranslation row per language found in the
      graph or declared in ``settings.LANGUAGES``.

    Tag selection:

    * ``--tag`` (repeatable) restricts the run to the given labels,
    * otherwise, unless ``--all`` is given, only tags without any
      ``dbpedia_fields`` are processed,
    * ``--random`` shuffles the tags, the default order is by label,
    * ``--start`` skips that many tags, ``--limit`` caps how many are
      processed after the skipped ones.

    A failure on one tag is printed with its traceback and the command
    carries on with the next tag.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    # Predicates read from the downloaded graph.
    _ABSTRACT_PREDICATE = URIRef(u'http://dbpedia.org/ontology/abstract')
    _LABEL_PREDICATE = URIRef(u'http://www.w3.org/2000/01/rdf-schema#label')
    _THUMBNAIL_PREDICATE = URIRef(u'http://dbpedia.org/ontology/thumbnail')
    # Any triple object containing this prefix is treated as a possible tag link.
    _RESOURCE_PREFIX = u'http://dbpedia.org/resource'

    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Reads the parsed command line ``options``, builds the tag queryset,
        asks for confirmation (unless ``--force`` or a non interactive run)
        and processes the tags one by one. Returns None.
        '''
        self.style = no_style()
        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()
        if count == 0:
            print("No tag to query : exit.")
            return

        if not self.force and self.interactive:
            confirm = raw_input("You have requested to query and replace the dbpedia information for %d tags.\n Are you sure you want to do this? \nType 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            # Reset Django's per-connection query bookkeeping on every tag.
            db.reset_queries()
            # The N3 document lives under /data/ instead of /resource/.
            rdf_uri = re.sub(r'/resource/', "/data/", tag.dbpedia_uri) + ".n3"
            g = Graph()
            try:
                g.parse(rdf_uri, format="n3")
                # One transaction per tag: a failure leaves no partial update.
                with transaction.commit_on_success():
                    self._store_graph(tag, g)
            except Exception as e:
                # Best effort: report the failure and go on with the next tag.
                print("\nError processing resource %s : %s" % (rdf_uri, unicode(e)))
                traceback.print_exception(type(e), e, sys.exc_info()[2])

    def _build_queryset(self, process_all):
        '''
        Return the queryset of tags to process.

        ``process_all`` is the value of the ``--all`` option; it is ignored
        when explicit tag labels were given with ``--tag``.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            # Only tags that have no dbpedia fields yet.
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        if self.random:
            queryset = queryset.order_by("?")
        else:
            queryset = queryset.order_by("label")

        if self.limit >= 0:
            # --limit is a number of tags, not an end index: the upper bound
            # of the slice has to be shifted by the number of skipped tags.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    @staticmethod
    def _pick_reference(by_lang, fallback):
        '''
        Return the 'fr' value of ``by_lang``, else the 'en' one, else any
        value, else ``fallback`` when the dict is empty.
        '''
        for code in ('fr', 'en'):
            if code in by_lang:
                return by_lang[code]
        if by_lang:
            return next(iter(by_lang.values()))
        return fallback

    def _store_graph(self, tag, graph):
        '''
        Store what ``graph`` says about ``tag``: tag links, dbpedia fields
        and their translations. Must be called inside a transaction.
        '''
        abstracts = {}
        labels = {}
        thumbnail = None
        for triple in graph:
            predicate, obj = triple[1], triple[2]
            if predicate == self._ABSTRACT_PREDICATE and obj is not None \
                    and hasattr(obj, 'language'):
                abstracts[obj.language] = unicode(obj)
            if predicate == self._LABEL_PREDICATE and obj is not None \
                    and hasattr(obj, 'language'):
                labels[obj.language] = unicode(obj)
            if predicate == self._THUMBNAIL_PREDICATE and obj is not None:
                thumbnail = unicode(obj)
            # Checked for every triple, whatever its predicate.
            if self._RESOURCE_PREFIX in obj:
                tagqs = Tag.objects.filter(dbpedia_uri=unicode(obj))
                if tagqs:
                    TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

        ref_label = self._pick_reference(labels, tag.label)
        ref_abstract = self._pick_reference(abstracts, None)

        # Every configured language gets a value, falling back on the reference.
        for lang in settings.LANGUAGES:
            if lang[0] not in labels:
                labels[lang[0]] = ref_label
            if lang[0] not in abstracts:
                abstracts[lang[0]] = ref_abstract

        dbfield, created = DbpediaFields.objects.get_or_create(
            dbpedia_uri=tag.dbpedia_uri,
            tag=tag,
            defaults={'abstract': ref_abstract, 'thumbnail': thumbnail, 'label': ref_label})  #@UndefinedVariable
        if not created:
            dbfield.abstract = ref_abstract
            dbfield.thumbnail = thumbnail
            dbfield.label = ref_label
            dbfield.save()
            # Translations are rebuilt from scratch below.
            DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()

        # A language known only by its label takes the reference abstract,
        # one known only by its abstract takes the reference label.
        for lang in set(labels) | set(abstracts):
            DbpediaFieldsTranslation.objects.create(
                master=dbfield,
                language_code=lang,
                label=labels.get(lang, ref_label),
                abstract=abstracts.get(lang, ref_abstract))
|
187 |
|
188 |
|
189 |
|
190 |
|
191 |
|
192 |