1 # -*- coding: utf-8 -*- |
|
2 ''' |
|
3 Created on Jan 30, 2012 |
|
4 |
|
5 @author: ymh |
|
6 ''' |
|
7 |
|
8 from django.core.management.base import NoArgsCommand |
|
9 from django.core.management.color import no_style |
|
10 |
|
11 from optparse import make_option |
|
12 from django.conf import settings |
|
13 from django.db.models import Count |
|
14 from django.db import transaction |
|
15 from hdabo.models import Tag |
|
16 from hdalab.models import DbpediaFields, TagLinks |
|
17 from hdabo.utils import show_progress |
|
18 from rdflib.graph import Graph |
|
19 from rdflib import URIRef |
|
20 import traceback |
|
21 import sys |
|
22 from hdalab.models.dataviz import DbpediaFieldsTranslation |
|
23 from django import db |
|
24 from SPARQLWrapper import SPARQLWrapper, RDF |
|
25 |
|
class Command(NoArgsCommand):
    '''
    Query the DBpedia SPARQL endpoint for tags and store the results.

    For every selected tag that has a ``dbpedia_uri``, the command fetches
    the abstracts, labels, thumbnail and outgoing DBpedia resource links,
    then writes them to ``DbpediaFields``, ``DbpediaFieldsTranslation`` and
    ``TagLinks``.
    '''
    options = ''
    help = """query and update wikipedia for tag title."""

    option_list = NoArgsCommand.option_list + (
        make_option('--all',
            action='store_true',
            dest='all',
            default=False,
            help='force all tags to be updated, not only those not yet processed'),
        make_option('--force',
            action='store_true',
            dest='force',
            default=False,
            help='ask no questions'),
        make_option('--random',
            action='store_true',
            dest='random',
            default=False,
            help='randomize query on tags'),
        make_option('--limit',
            action='store',
            type='int',
            dest='limit',
            default= -1,
            help='number of tag to process'),
        make_option('--start',
            action='store',
            type='int',
            dest='start',
            default=0,
            help='number of tag to ignore'),
        make_option('--tag',
            action='append',
            dest='tags',
            type='string',
            default=[],
            help='the tag to query'),
    )

    # Predicate under which the values of a converted SPARQL result are read.
    SPARQL_RESULT_VALUE = URIRef('http://www.w3.org/2005/sparql-results#value')

    # Languages tried first, in this order, when choosing a reference text.
    PREFERRED_LANGUAGES = ('fr', 'en')

    def handle_noargs(self, **options):
        '''
        Entry point of the command.

        Reads the command-line options, selects the tags to process, asks
        for confirmation (unless ``--force`` is given or the command is not
        interactive) and then updates the tags one by one. An error on one
        tag is printed with its traceback and the loop moves on to the next
        tag.
        '''
        self.style = no_style()
        self.interactive = options.get('interactive', True)
        self.verbosity = int(options.get('verbosity', '1'))
        self.force = options.get('force', False)
        self.limit = options.get("limit", -1)
        self.start = options.get("start", 0)
        self.random = options.get('random', False)

        if self.verbosity > 2:
            print("option passed : " + repr(options))

        self.tag_list = options.get("tags", [])

        queryset = self._build_queryset(options.get('all', False))

        if self.verbosity > 2:
            print("Tag Query is %s" % (queryset.query))

        count = queryset.count()
        if count == 0:
            print("No tag to query : exit.")
            return

        if not self.force and self.interactive:
            confirm = raw_input(
                "You have requested to query and replace the dbpedia information for %d tags.\n"
                " Are you sure you want to do this? \n"
                "Type 'yes' to continue, or 'no' to cancel: " % (count))
        else:
            confirm = 'yes'

        if confirm != "yes":
            print("dbpedia query cancelled")
            return

        endpoint = SPARQLWrapper("http://dbpedia.org/sparql", returnFormat=RDF)

        writer = None
        for i, tag in enumerate(queryset):
            writer = show_progress(i + 1, count, tag.label, 50, writer)
            # NOTE(review): presumably keeps Django's recorded query log from
            # growing during a long run - confirm.
            db.reset_queries()
            try:
                self._update_tag(endpoint, tag)
            except Exception as e:
                # Best effort: report the failing resource and continue.
                # The message uses tag.dbpedia_uri; the former `rdf_uri`
                # variable was never assigned and raised a NameError here.
                print("\nError processing resource %s : %s" % (tag.dbpedia_uri, unicode(e)))
                traceback.print_exception(type(e), e, sys.exc_info()[2])

    def _build_queryset(self, process_all):
        '''
        Build the queryset of tags to process from the parsed options.

        Only tags with a ``dbpedia_uri`` are considered. Explicit ``--tag``
        labels take precedence; otherwise, unless ``process_all`` is true,
        only tags without any related ``dbpedia_fields`` are kept.
        '''
        queryset = Tag.objects.exclude(dbpedia_uri=None)

        if self.tag_list:
            queryset = queryset.filter(label__in=self.tag_list)
        elif not process_all:
            queryset = queryset.annotate(dbfc=Count('dbpedia_fields')).filter(dbfc=0)

        queryset = queryset.order_by("?" if self.random else "label")

        if self.limit >= 0:
            # --limit is a number of tags to process, not an end index, so
            # the slice must end at start + limit.
            queryset = queryset[self.start:self.start + self.limit]
        elif self.start > 0:
            queryset = queryset[self.start:]

        return queryset

    def _query_values(self, endpoint, query):
        '''
        Run ``query`` on ``endpoint`` and return the list of result values.

        The values are the objects of the triples whose predicate is
        ``SPARQL_RESULT_VALUE`` in the converted result.
        '''
        endpoint.setQuery(query)
        result = endpoint.queryAndConvert()
        return [o for _, _, o in result.triples((None, self.SPARQL_RESULT_VALUE, None))]

    def _pick_reference(self, texts, fallback):
        '''
        Choose the reference ``(language, text)`` pair among ``texts``.

        ``texts`` maps a language code to a ``(text, is_translated)`` tuple.
        The preferred languages are tried first, then any available entry;
        when ``texts`` is empty, ``('fr', fallback)`` is returned.
        '''
        for lang in self.PREFERRED_LANGUAGES:
            if lang in texts:
                return lang, texts[lang][0]
        if texts:
            lang, (text, _) = next(iter(texts.items()))
            return lang, text
        return 'fr', fallback

    def _update_tag(self, endpoint, tag):
        '''
        Fetch the DBpedia data of ``tag`` and store it in the database.

        All database writes for the tag happen inside a single
        ``transaction.commit_on_success()`` block. Exceptions are not
        handled here; they propagate to the caller.
        '''
        abstracts = {}
        labels = {}
        thumbnail = None

        with transaction.commit_on_success():
            for o in self._query_values(endpoint, "select distinct ?y where {<%s> <http://dbpedia.org/ontology/abstract> ?y}" % (tag.dbpedia_uri)):
                abstracts[o.language] = (unicode(o), True)

            for o in self._query_values(endpoint, "select distinct ?y where {<%s> <http://www.w3.org/2000/01/rdf-schema#label> ?y}" % (tag.dbpedia_uri)):
                labels[o.language] = (unicode(o), True)

            for o in self._query_values(endpoint, "select distinct ?y where {<%s> <http://dbpedia.org/ontology/thumbnail> ?y} limit 1" % (tag.dbpedia_uri)):
                thumbnail = unicode(o)

            # Link this tag to every other known tag whose DBpedia resource
            # it references.
            for o in self._query_values(endpoint, 'select distinct ?y where { <%s> ?p ?y . FILTER regex(?y, "^http://dbpedia.org/resource")}' % (tag.dbpedia_uri)):
                tagqs = Tag.objects.filter(dbpedia_uri=unicode(o))
                if tagqs:
                    TagLinks.objects.get_or_create(subject=tag, object=tagqs[0])

            # The reference texts must be chosen before the fallbacks below
            # are added, so that only values returned by DBpedia are used.
            ref_label_lang, ref_label = self._pick_reference(labels, tag.label)
            ref_abstract_lang, ref_abstract = self._pick_reference(abstracts, None)

            # Every configured language gets an entry; missing ones reuse the
            # reference text and are flagged as not translated.
            for lang in settings.LANGUAGES:
                labels.setdefault(lang[0], (ref_label, False))
                abstracts.setdefault(lang[0], (ref_abstract, False))

            dbfield, created = DbpediaFields.objects.get_or_create(dbpedia_uri=tag.dbpedia_uri, tag=tag, defaults={'abstract':ref_abstract, 'thumbnail':thumbnail, 'label':ref_label}) #@UndefinedVariable
            if not created:
                dbfield.abstract = ref_abstract
                dbfield.thumbnail = thumbnail
                dbfield.label = ref_label
                dbfield.save()
                # Existing translations are replaced by the ones created below.
                DbpediaFieldsTranslation.objects.filter(master=dbfield).delete()

            # Merge labels and abstracts per language into [label, abstract]
            # pairs, each item being a (text, is_translated) tuple.
            consolidated_trans = {}
            for lang, label in labels.items():
                consolidated_trans[lang] = [label, (ref_abstract, lang == ref_abstract_lang)]
            for lang, abstract in abstracts.items():
                if lang in consolidated_trans:
                    consolidated_trans[lang][1] = abstract
                else:
                    consolidated_trans[lang] = [(ref_label, lang == ref_label_lang), abstract]

            for lang, trans in consolidated_trans.items():
                label, abstract = tuple(trans)
                DbpediaFieldsTranslation.objects.create(master=dbfield, language_code=lang, label=label[0], is_label_translated=label[1], abstract=abstract[0], is_abstract_translated=abstract[1])
|
200 |
|
201 |
|
202 |
|
203 |
|
204 |
|
205 |
|