# HG changeset patch # User cavaliet # Date 1391100734 -3600 # Node ID a023e0185a02e6833a3e04724b80cdf628621224 # Parent 085ea4dbfeee1ff9258235625ad8f4c628198080 migration to real dbpedia uri diff -r 085ea4dbfeee -r a023e0185a02 DoctrineMigrations/Version20140129151724.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DoctrineMigrations/Version20140129151724.php Thu Jan 30 17:52:14 2014 +0100 @@ -0,0 +1,49 @@ +abortIf($this->connection->getDatabasePlatform()->getName() != "mysql"); + + // First we get all tags. + $em = $GLOBALS["kernel"]->getContainer()->get( 'doctrine.orm.entity_manager' ); + $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC');//->setMaxResults(40)->setFirstResult(5000); + $tags = $query->getResult(); + + $i = 1; + foreach($tags as $tag){ + $l = $tag->getLabel(); + $uri = WikiTagUtils::getDbpediaUri($tag->getLabel(), [], false); + $tag->setDbpediaUri($uri); + $em->persist($tag); + if( $i % 50 == 0 ){ + $em->flush(); + echo "\n FLUSH"; + } + $i++; + echo "\n$i : $l \t\t: $uri"; + } + $em->flush(); + } + + public function down(Schema $schema) + { + // this down() migration is autogenerated, please modify it to your needs + $this->abortIf($this->connection->getDatabasePlatform()->getName() != "mysql"); + } +} diff -r 085ea4dbfeee -r a023e0185a02 Utils/WikiTagUtils.php --- a/Utils/WikiTagUtils.php Wed Jan 29 12:16:16 2014 +0100 +++ b/Utils/WikiTagUtils.php Thu Jan 30 17:52:14 2014 +0100 @@ -190,7 +190,7 @@ * @param string $url * @return object (json decoded) */ - private static function curlRequest($url) + private static function curlRequest($url, $throw_error=true) { $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, $url); @@ -219,7 +219,7 @@ $curl_error = curl_error($ch); curl_close($ch); - if ($curl_errno > 0) { + if ($curl_errno > 0 && $throw_error) { throw new \Exception("$url\n request failed. 
cURLError #$curl_errno: $curl_error\n", $curl_errno, null); } @@ -281,16 +281,19 @@ /** * Builds DbPedia URI */ - private static function getDbpediaUri($label, $params=[]) + public static function getDbpediaUri($label, $params=[], $throw_error=true) { // Get lang from url $dbp_url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"]; $lang = substr($dbp_url, 7, 2); + // filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL" + //"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' }', + //"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }', $params = [ - "query" => 'select distinct * where {?s rdfs:label "'.$label.'"@'.$lang.'}', + "query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }', "format" => 'application/json', ]; - + $params_str = ''; foreach ($params as $key => $value) { if ($params_str==''){ @@ -303,15 +306,30 @@ $url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"].'?'.$params_str; - $res = WikiTagUtils::curlRequest($url); + $res = WikiTagUtils::curlRequest($url, $throw_error); $val = json_decode($res, true); $uri = ""; - if(array_key_exists("results", $val)){ - if(array_key_exists("bindings", $val["results"])){ - if(count($val["results"]["bindings"]) > 0){ - $uri = $val["results"]["bindings"][0]["s"]["value"]; - } - } + if($val){ + if(array_key_exists("results", $val)){ + if(array_key_exists("bindings", $val["results"])){ + $len = count($val["results"]["bindings"]); + if($len > 0){ + $uri = $val["results"]["bindings"][0]["s"]["value"]; + if($len>1){ + // If there are several results, we test the "url label" to see if it matches the label. + // Why ? Because, for example "1000" gets "Category:1000" and "1000" as result. 
+ // We keep this fallback to be safe, although the regexp filter in the SPARQL request normally prevents this case. + for($i=0;$i<$len;$i++){ + $res_uri = $val["results"]["bindings"][$i]["s"]["value"]; + $url_label = substr( $res_uri, strrpos( $res_uri, '/' )+1 ); + if(str_replace(" ", "_", $label) == $url_label){ + $uri = $res_uri; + } + } + } + } + } + } } return $uri; }