# HG changeset patch # User cavaliet # Date 1391180930 -3600 # Node ID 5771052a647a5a04cd46652456cdec982d1fde56 # Parent a023e0185a02e6833a3e04724b80cdf628621224 better migration for dbpedia uri diff -r a023e0185a02 -r 5771052a647a DoctrineMigrations/Version20140129151724.php --- a/DoctrineMigrations/Version20140129151724.php Thu Jan 30 17:52:14 2014 +0100 +++ b/DoctrineMigrations/Version20140129151724.php Fri Jan 31 16:08:50 2014 +0100 @@ -20,17 +20,26 @@ // this up() migration is autogenerated, please modify it to your needs $this->abortIf($this->connection->getDatabasePlatform()->getName() != "mysql"); + // First we get all tags. $em = $GLOBALS["kernel"]->getContainer()->get( 'doctrine.orm.entity_manager' ); - $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC');//->setMaxResults(40)->setFirstResult(5000); + // Avoid php annoying memory leaks + $em->getConnection()->getConfiguration()->setSQLLogger(null); + + // First step : we populate the dbpedia uris thanks to the dbpedia-owl:wikiPageID + echo "\nFIRST STEP"; + $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC');//->setMaxResults(240)->setFirstResult(500); $tags = $query->getResult(); - $i = 1; + $nb_set = 0; foreach($tags as $tag){ $l = $tag->getLabel(); - $uri = WikiTagUtils::getDbpediaUri($tag->getLabel(), [], false); + $uri = WikiTagUtils::getDbpediaUri($tag->getWikipediaPageId(), [], false, "pageid"); $tag->setDbpediaUri($uri); $em->persist($tag); + if($uri!=NULL && $uri!=""){ + $nb_set++; + } if( $i % 50 == 0 ){ $em->flush(); echo "\n FLUSH"; @@ -39,6 +48,57 @@ echo "\n$i : $l \t\t: $uri"; } $em->flush(); + echo "\nFIRST STEP : $nb_set uris found"; + + + // Second step : we populate the dbpedia uris not found thanks to the foaf:isPrimaryTopicOf + echo "\nSECOND STEP"; + $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t WHERE (t.dbpediaUri=\'\' OR t.dbpediaUri IS NULL) ORDER BY t.label ASC');//->setMaxResults(240); + $tags = 
$query->getResult(); + $i = 1; + $nb_set = 0; + foreach($tags as $tag){ + $l = $tag->getLabel(); + $uri = WikiTagUtils::getDbpediaUri($tag->getWikipediaUrl(), [], false, "wikiurl"); + $tag->setDbpediaUri($uri); + $em->persist($tag); + if($uri!=NULL && $uri!=""){ + $nb_set++; + } + if( $i % 50 == 0 ){ + $em->flush(); + echo "\n FLUSH"; + } + $i++; + echo "\n$i : $l \t\t: $uri"; + } + $em->flush(); + echo "\nSECOND STEP : $nb_set uris found"; + + + // Third step : we populate the dbpedia uris not found thanks to the rdfs:label + echo "\nTHIRD STEP"; + $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t WHERE (t.dbpediaUri=\'\' OR t.dbpediaUri IS NULL) ORDER BY t.label ASC');//->setMaxResults(240); + $tags = $query->getResult(); + $i = 1; + $nb_set = 0; + foreach($tags as $tag){ + $l = $tag->getLabel(); + $uri = WikiTagUtils::getDbpediaUri($tag->getLabel(), [], false); + $tag->setDbpediaUri($uri); + $em->persist($tag); + if($uri!=NULL && $uri!=""){ + $nb_set++; + } + if( $i % 50 == 0 ){ + $em->flush(); + echo "\n FLUSH"; + } + $i++; + echo "\n$i : $l \t\t: $uri"; + } + $em->flush(); + echo "\nTHIRD STEP : $nb_set uris found"; } public function down(Schema $schema) diff -r a023e0185a02 -r 5771052a647a README.md --- a/README.md Thu Jan 30 17:52:14 2014 +0100 +++ b/README.md Fri Jan 31 16:08:50 2014 +0100 @@ -332,4 +332,12 @@ + last doctags: php app/console wikitag:load-fixtures -B 40001 /path/to/data.json The -B (index Begin) and -E (index End) works alson on the tags. Therefore you cans import tags also in slices. + +## Migration + +The wikitag folder contains a migration in DoctrineMigrations/Version20140129151724.php. If your wikitag is older than V00.14, you need to run this migration. +This migration takes every tag label and searches for the REAL dbpedia uri associated with this label. +Before, the dbpedia uri was built manually as http://dbpedia.org/resource/ + english_label. 
+Now we get the dbpedia uri by requesting http://LANG_CODE.dbpedia.org/sparql with the current label. + diff -r a023e0185a02 -r 5771052a647a Utils/WikiTagUtils.php --- a/Utils/WikiTagUtils.php Thu Jan 30 17:52:14 2014 +0100 +++ b/Utils/WikiTagUtils.php Fri Jan 31 16:08:50 2014 +0100 @@ -281,16 +281,25 @@ /** * Builds DbPedia URI */ - public static function getDbpediaUri($label, $params=[], $throw_error=true) + public static function getDbpediaUri($label, $params=[], $throw_error=true, $req_param="label") { // Get lang from url $dbp_url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"]; $lang = substr($dbp_url, 7, 2); + $query = 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }'; + if($req_param=="pageid"){ + $query = 'select distinct * where { ?s dbpedia-owl:wikiPageID '.$label.' }'; + } + elseif ($req_param=="wikiurl"){ + $query = 'select distinct * where { ?s foaf:isPrimaryTopicOf <'.$label.'> }'; + } + + // filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL" //"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' }', //"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }', $params = [ - "query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }', + "query" => $query, "format" => 'application/json', ];