Utils/WikiTagUtils.php
changeset 2 13f43f53d0ba
child 8 7d2fb5d7c9ff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Utils/WikiTagUtils.php	Sun Oct 16 14:50:48 2011 +0200
@@ -0,0 +1,268 @@
+<?php
+
+namespace IRI\Bundle\WikiTagBundle\Utils;
+
+use IRI\Bundle\WikiTagBundle\Entity\Tag;
+
+/**
+ * Static helpers that link a free-text tag label to a Wikipedia page
+ * (through the MediaWiki API) and to the matching DBpedia resource.
+ */
+class WikiTagUtils
+{
+    // Constants
+    private static $WIKIPEDIA_API_URL = "http://fr.wikipedia.org/w/api.php";
+    private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s";
+    private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s";
+
+    /**
+     * Gets the tag matching a label, or creates and persists it.
+     *
+     * Existing tags are looked up by normalized label; a tag whose url status
+     * is not "null_result" is preferred over one whose status is. When no tag
+     * exists, a new one is built, completed with the Wikipedia lookup result
+     * and flushed through the entity manager. When a tag already exists and
+     * has a Wikipedia page id, Wikipedia is only queried for the current
+     * revision id.
+     *
+     * @param string $tag_label the label as typed by the user
+     * @param object $doctrine  object exposing getRepository() and getEntityManager()
+     * @return array array($tag, $wikipedia_revision_id, $created) where $tag is
+     *               the Tag entity, $wikipedia_revision_id the last revision id
+     *               reported by Wikipedia (or null) and $created a boolean
+     */
+    public static function getOrCreateTag($tag_label, $doctrine)
+    {
+        $tag_label_normalized = self::normalizeTag($tag_label);
+        $null_status = Tag::$TAG_URL_STATUS_DICT['null_result'];
+
+        // Pick the first tag with a usable status, falling back to the first tag found.
+        $candidates = $doctrine->getRepository('WikiTagBundle:Tag')->findBy(array('normalizedLabel' => $tag_label_normalized));
+        $tag = null;
+        foreach ($candidates as $candidate) {
+            if ($candidate->getUrlStatus() != $null_status) {
+                $tag = $candidate;
+                break;
+            }
+            if ($tag === null) {
+                $tag = $candidate;
+            }
+        }
+
+        $created = ($tag === null);
+        if ($created) {
+            $tag = new Tag();
+            $tag->setLabel($tag_label_normalized);
+            $tag->setOriginalLabel($tag_label);
+            $tag->setNormalizedLabel($tag_label_normalized);
+
+            // A new tag is completed with what Wikipedia knows about the label.
+            $wp_response = self::getWikipediaInfo($tag_label_normalized);
+
+            // Loose comparisons are kept on purpose: an empty label or a status
+            // equal to 0 is not written to the entity.
+            // NOTE(review): confirm this against the default values of Tag.
+            if ($wp_response['new_label'] != null) {
+                $tag->setLabel($wp_response['new_label']);
+            }
+            if ($wp_response['status'] != null) {
+                $tag->setUrlStatus($wp_response['status']);
+            }
+            $tag->setWikipediaUrl($wp_response['wikipedia_url']);
+            $tag->setWikipediaPageId($wp_response['pageid']);
+            $tag->setDbpediaUri($wp_response['dbpedia_uri']);
+
+            $em = $doctrine->getEntityManager();
+            $em->persist($tag);
+            $em->flush();
+
+            $wikipedia_revision_id = $wp_response['revision_id'];
+        } elseif ($tag->getWikipediaPageId() != null) {
+            // Known tag: only the current revision id is refreshed.
+            $wp_response = self::getWikipediaInfo(null, $tag->getWikipediaPageId());
+            $wikipedia_revision_id = $wp_response['revision_id'];
+        } else {
+            $wikipedia_revision_id = null;
+        }
+
+        return array($tag, $wikipedia_revision_id, $created);
+    }
+
+    /**
+     * Cleans a tag label: underscores become spaces, runs of whitespace are
+     * collapsed to one space, leading/trailing whitespace is removed and the
+     * first letter is upper-cased.
+     *
+     * The trimming is done after the underscore replacement so that a label
+     * such as "_foo_" gives "Foo" and not " foo ".
+     *
+     * @param string $tag_label the raw label
+     * @return string the cleaned label (an empty input is returned unchanged)
+     */
+    private static function normalizeTag($tag_label)
+    {
+        if (strlen($tag_label) == 0) {
+            return $tag_label;
+        }
+        $label = str_replace("_", " ", $tag_label);
+        // The "u" modifier makes PCRE read the subject as UTF-8, so a multi-byte
+        // character is never cut in the middle. preg_replace returns null when
+        // the subject is not valid UTF-8: the byte-wise pattern is used then.
+        $collapsed = preg_replace('/\s+/u', ' ', $label);
+        if ($collapsed === null) {
+            $collapsed = preg_replace('/\s+/', ' ', $label);
+        }
+        return self::upperCaseFirst(trim($collapsed));
+    }
+
+    /**
+     * Upper-cases the first character of a label. Uses the mbstring functions
+     * when they are available and the label is valid UTF-8, so that accented
+     * initials ("école") are handled; otherwise falls back to ucfirst().
+     *
+     * @param string $label
+     * @return string
+     */
+    private static function upperCaseFirst($label)
+    {
+        if ($label === '' || !function_exists('mb_substr') || !mb_check_encoding($label, 'UTF-8')) {
+            return ucfirst($label);
+        }
+        $first = mb_substr($label, 0, 1, 'UTF-8');
+        // An explicit length is given because old PHP versions read a null length as 0.
+        $rest = mb_substr($label, 1, mb_strlen($label, 'UTF-8'), 'UTF-8');
+        return mb_strtoupper($first, 'UTF-8').$rest;
+    }
+
+    /**
+     * Queries Wikipedia for a label or, when no label is given, for a page id.
+     *
+     * @param string|null $tag_label_normalized the cleaned label, or null to search by page id
+     * @param mixed|null  $page_id              the Wikipedia page id, used only without label
+     * @return array associative array with the keys new_label, status,
+     *               wikipedia_url, pageid, dbpedia_uri, revision_id and response
+     *               (raw body of the first API answer). On failure every value
+     *               but status and response is null, see returnNullResult().
+     */
+    private static function getWikipediaInfo($tag_label_normalized, $page_id=null)
+    {
+        $params = array('action'=>'query', 'prop'=>'info|categories|langlinks', 'inprop'=>'url', 'lllimit'=>'500', 'cllimit'=>'500', 'rvprop'=>'ids', 'format'=>'json');
+        // Values are given raw: requestWikipedia() url-encodes the whole query string.
+        if ($tag_label_normalized != null) {
+            $params['titles'] = $tag_label_normalized;
+        } elseif ($page_id != null) {
+            $params['pageids'] = $page_id;
+        } else {
+            return self::returnNullResult(null);
+        }
+
+        list($res, $pages) = self::requestWikipedia($params);
+        $original_response = $res;
+        // Anything other than exactly one page is a failed query.
+        if (count($pages) != 1) {
+            return self::returnNullResult($res);
+        }
+        $page = reset($pages);
+        // Unknown or invalid entry?
+        if (array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
+            return self::returnNullResult($res);
+        }
+
+        // The entry exists, we read its data.
+        $url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
+        $pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
+        $new_label = array_key_exists('title', $page) ? $page['title'] : null;
+
+        // The redirect test comes first because a redirect has no categories key.
+        if (array_key_exists('redirect', $page)) {
+            $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
+        } elseif (self::isHomonymy($page)) {
+            $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
+        } else {
+            $status = Tag::$TAG_URL_STATUS_DICT["match"];
+        }
+
+        // For a redirection the same query is replayed with redirects=true; the
+        // page returned then provides the revision id and the language links,
+        // while url, page id and label stay those of the first answer.
+        if ($status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
+            $params['redirects'] = "true";
+            list($res, $pages) = self::requestWikipedia($params);
+            if (count($pages) != 1) {
+                return self::returnNullResult($res);
+            }
+            $page = reset($pages);
+        }
+
+        $revision_id = isset($page['lastrevid']) ? $page['lastrevid'] : null;
+
+        // The english language link, when present, gives the label used for DBpedia.
+        $english_label = null;
+        if ($status == Tag::$TAG_URL_STATUS_DICT["match"] || $status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
+            if (isset($page["langlinks"]) && is_array($page["langlinks"])) {
+                foreach ($page["langlinks"] as $lang_link) {
+                    if (isset($lang_link["lang"], $lang_link["*"]) && $lang_link["lang"] == "en") {
+                        $english_label = $lang_link["*"];
+                        break;
+                    }
+                }
+            }
+        }
+
+        // No DBpedia uri for a label that contains a "#".
+        $dbpedia_uri = null;
+        if ($english_label != null && strpos($english_label, '#') === false) {
+            $dbpedia_uri = self::getDbpediaUri($english_label);
+        }
+
+        return array('new_label'=>$new_label, 'status'=>$status, 'wikipedia_url'=>$url, 'pageid'=>$pageid, 'dbpedia_uri'=>$dbpedia_uri, 'revision_id'=>$revision_id, 'response'=>$original_response);
+    }
+
+    /**
+     * Sends a GET request to the Wikipedia API and decodes the JSON answer.
+     *
+     * @param array $params query parameters, as raw (not url-encoded) values
+     * @return array array($res, $pages) where $res is the raw response body
+     *               (false when the request failed) and $pages the content of
+     *               query.pages, or an empty array when it is not available
+     */
+    private static function requestWikipedia($params)
+    {
+        // Every key and value is url-encoded here; the separator is given
+        // explicitly because the default one depends on the PHP configuration.
+        $url = self::$WIKIPEDIA_API_URL.'?'.http_build_query($params, '', '&');
+
+        $ch = curl_init();
+        if ($ch === false) {
+            return array(false, array());
+        }
+        curl_setopt($ch, CURLOPT_URL, $url);
+        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
+        curl_setopt($ch, CURLOPT_USERAGENT, 'http://www.iri.centrepompidou.fr');
+        $res = curl_exec($ch);
+        curl_close($ch);
+
+        // A failed transfer, a body that is not JSON or an answer without
+        // query.pages all give an empty page list, which callers treat as
+        // "no result".
+        $pages = array();
+        if ($res !== false) {
+            $val = json_decode($res, true);
+            if (is_array($val) && isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
+                $pages = $val["query"]["pages"];
+            }
+        }
+        return array($res, $pages);
+    }
+
+    /**
+     * Builds the result returned when the Wikipedia lookup gave nothing usable:
+     * the status is "null_result" and every other value is null.
+     *
+     * @param mixed $response the raw API response to keep in the result, if any
+     * @return array same keys as the result of getWikipediaInfo()
+     */
+    private static function returnNullResult($response)
+    {
+        return array('new_label'=>null, 'status'=>Tag::$TAG_URL_STATUS_DICT['null_result'], 'wikipedia_url'=>null, 'pageid'=>null, 'dbpedia_uri'=>null, 'revision_id'=>null, 'response'=>$response);
+    }
+
+    /**
+     * Tells whether a page is a disambiguation (homonymy) page, i.e. whether
+     * one of its category titles contains the homonymy/disambiguation marker.
+     *
+     * @param array $page one entry of query.pages
+     * @return boolean false when the page has no categories
+     */
+    private static function isHomonymy($page)
+    {
+        if (!isset($page["categories"]) || !is_array($page["categories"])) {
+            return false;
+        }
+        foreach ($page["categories"] as $category) {
+            if (!isset($category["title"])) {
+                continue;
+            }
+            // Strict test because strpos returns 0 for a match at the start of the title.
+            if (strpos($category["title"], 'Catégorie:Homonymie') !== false || strpos($category["title"], 'Category:Disambiguation') !== false) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Builds the DBpedia URI of an english Wikipedia label.
+     *
+     * @param string $english_label
+     * @return string
+     */
+    private static function getDbpediaUri($english_label)
+    {
+        return sprintf(self::$DBPEDIA_URI_TEMPLATE, self::urlize_for_wikipedia($english_label));
+    }
+
+    /**
+     * Turns a label into a Wikipedia url fragment: spaces become underscores,
+     * then the result is url-encoded.
+     *
+     * @param string $label
+     * @return string
+     */
+    private static function urlize_for_wikipedia($label)
+    {
+        return urlencode(str_replace(" ", "_", $label));
+    }
+}