Utils/WikiTagUtils.php
changeset 2 13f43f53d0ba
child 8 7d2fb5d7c9ff
equal deleted inserted replaced
1:06a22ff5d58d 2:13f43f53d0ba
       
     1 <?php
       
     2 
       
     3 namespace IRI\Bundle\WikiTagBundle\Utils;
       
     4 
       
     5 use IRI\Bundle\WikiTagBundle\Entity\Tag;
       
     6 
       
     7 class WikiTagUtils
       
     8 {
       
    // Constants (declared as private static properties, not as class constants)

    // Endpoint of the French-language Wikipedia API; requestWikipedia() appends the query string to it.
    private static $WIKIPEDIA_API_URL = "http://fr.wikipedia.org/w/api.php";

    // sprintf() template whose %s fills the "oldid" query parameter.
    // NOTE(review): not referenced by any method visible in this class - confirm whether it is still needed.
    private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s";

    // sprintf() template for a DBpedia resource URI; getDbpediaUri() fills %s with the URL-encoded English label.
    private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s";
       
    13     
       
    14     
       
    /**
     * Finds the tag matching a label, or creates and persists it when none exists.
     *
     * The label is cleaned with normalizeTag() and existing tags are searched by their
     * normalizedLabel. Among the matches, the first one whose URL status differs from
     * 'null_result' is chosen; if there is none, the first match is kept. When nothing
     * matches, a new Tag is built, completed with the result of the Wikipedia lookup for
     * the normalized label, then persisted and flushed. For an existing tag that has a
     * Wikipedia page id, Wikipedia is queried only to read the current revision id.
     *
     * @param string $tag_label label as received from the caller
     * @param object $doctrine  object providing getRepository() and getEntityManager()
     *                          (presumably the Doctrine registry - confirm against callers)
     * @return array array($tag, $wikipedia_revision_id, $created): the Tag entity, the
     *               Wikipedia revision id (null when unknown) and a boolean telling
     *               whether the tag was created by this call
     */
    public static function getOrCreateTag($tag_label, $doctrine)
    {
        $normalized = self::normalizeTag($tag_label);

        // Look for existing tags carrying the same normalized label.
        $repository = $doctrine->getRepository('WikiTagBundle:Tag');
        $matches = $repository->findBy(array('normalizedLabel' => $normalized));

        // Prefer the first tag that has a usable Wikipedia status; otherwise keep the first match.
        $tag = null;
        foreach ($matches as $candidate) {
            $is_resolved = $candidate->getUrlStatus() != Tag::$TAG_URL_STATUS_DICT['null_result'];
            if ($tag === null || $is_resolved) {
                $tag = $candidate;
            }
            if ($is_resolved) {
                break;
            }
        }

        $created = ($tag === null);

        if ($created) {
            $tag = new Tag();
            $tag->setLabel($normalized);
            $tag->setOriginalLabel($tag_label);
            $tag->setNormalizedLabel($normalized);

            // Wikipedia is only fully queried for tags created by this call.
            $info = self::getWikipediaInfo($normalized);

            // Loose comparisons on purpose: any value equal to null leaves the tag untouched.
            if ($info['new_label'] != null) {
                $tag->setLabel($info['new_label']);
            }
            if ($info['status'] != null) {
                $tag->setUrlStatus($info['status']);
            }
            $tag->setWikipediaUrl($info['wikipedia_url']);
            $tag->setWikipediaPageId($info['pageid']);
            $tag->setDbpediaUri($info['dbpedia_uri']);

            // Store the new tag.
            $manager = $doctrine->getEntityManager();
            $manager->persist($tag);
            $manager->flush();

            return array($tag, $info['revision_id'], $created);
        }

        if ($tag->getWikipediaPageId() != null) {
            // Existing tag linked to a page: only the current revision id is needed.
            $info = self::getWikipediaInfo(null, $tag->getWikipediaPageId());

            return array($tag, $info['revision_id'], $created);
        }

        return array($tag, null, $created);
    }
       
    90     
       
    /**
     * Cleans a tag label so that equivalent spellings share one normalized form.
     *
     * Underscores become spaces, every run of whitespace is collapsed to a single space,
     * surrounding whitespace is removed and the first character is upper-cased.
     * An empty (or null) label is returned unchanged.
     *
     * @param string $tag_label raw label
     * @return string normalized label
     */
    private static function normalizeTag($tag_label)
    {
        if(strlen($tag_label)==0){
            return $tag_label;
        }
        // Underscores are replaced BEFORE trimming: otherwise "_foo_" would keep the
        // leading and trailing spaces produced by the replacement.
        $tag_label = str_replace("_", " ", $tag_label);
        $tag_label = preg_replace('/\s+/', ' ', $tag_label);
        $tag_label = trim($tag_label);
        if($tag_label===''){
            return $tag_label;
        }
        // ucfirst() works on bytes and leaves a multibyte first letter ("école") untouched,
        // so the multibyte functions are used whenever the mbstring extension is loaded.
        if(function_exists('mb_strtoupper') && function_exists('mb_substr') && function_exists('mb_strlen')){
            $first = mb_substr($tag_label, 0, 1, 'UTF-8');
            $rest = mb_substr($tag_label, 1, mb_strlen($tag_label, 'UTF-8'), 'UTF-8');
            return mb_strtoupper($first, 'UTF-8').$rest;
        }
        return ucfirst($tag_label);
    }
       
   105     
       
   106     /**
       
   107      *
       
   108      * TODO: Enter description here ...
       
   109      * @param unknown_type $tag_label_normalized
       
   110      * @param unknown_type $page_id
       
   111      * @return multitype:NULL unknown |multitype:Ambigous <NULL, unknown> multitype:number  mixed Ambigous <NULL, string> Ambigous <unknown, mixed>
       
   112      */
       
   113     private static function getWikipediaInfo($tag_label_normalized, $page_id=null)
       
   114     {
       
   115         $params = array('action'=>'query', 'prop'=>'info|categories|langlinks', 'inprop'=>'url', 'lllimit'=>'500', 'cllimit'=>'500', 'rvprop'=>'ids', 'format'=>'json');
       
   116         if($tag_label_normalized!=null){
       
   117             $params['titles'] = urlencode($tag_label_normalized);
       
   118         }
       
   119         else if($page_id!=null){
       
   120             $params['pageids'] = $page_id;
       
   121         }
       
   122         else{
       
   123             return WikiTagUtils::returnNullResult(null);
       
   124         }
       
   125         
       
   126         $ar = WikiTagUtils::requestWikipedia($params);
       
   127         $res = $ar[0];
       
   128         $original_response = $res;
       
   129         $pages = $ar[1];
       
   130         // If there 0 or more than 1 result, the query has failed
       
   131         if(count($pages)>1 || count($pages)==0){
       
   132             return WikiTagUtils::returnNullResult($res);
       
   133         }
       
   134         // get first result
       
   135         $page = reset($pages);
       
   136         // Unknow entry ?
       
   137         if(array_key_exists('missing', $page) || array_key_exists('invalid', $page)){
       
   138             return WikiTagUtils::returnNullResult($res);
       
   139         }
       
   140         // The entry exists, we get the datas.
       
   141         $url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
       
   142         $pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
       
   143         $new_label = array_key_exists('title', $page) ? $page['title'] : null;
       
   144         // We test the status (redirect first because a redirect has no categories key)
       
   145         if(array_key_exists('redirect', $page)){
       
   146             //return " REDIRECT";
       
   147             $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
       
   148         }
       
   149         else if(WikiTagUtils::isHomonymy($page)){
       
   150             //return " HOMONYMY";
       
   151             $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
       
   152         }
       
   153         else{
       
   154             //return " MATCH";
       
   155             $status = Tag::$TAG_URL_STATUS_DICT["match"];
       
   156         }
       
   157         // In redirection, we have to get more datas by adding redirects=true to the params
       
   158         if($status==Tag::$TAG_URL_STATUS_DICT["redirection"]){
       
   159             $params['redirects'] = "true";
       
   160             $ar = WikiTagUtils::requestWikipedia($params);
       
   161             $res = $ar[0];
       
   162             $pages = $ar[1];
       
   163             #we know that we have at least one answer
       
   164             if(count($pages)>1 || count($pages)==0){
       
   165                 return WikiTagUtils::returnNullResult($res);
       
   166             }
       
   167             // get first result
       
   168             $page = reset($pages);
       
   169         }
       
   170         
       
   171         $revision_id = $page['lastrevid'];
       
   172         
       
   173         // process language to extract the english label
       
   174         $english_label = null;
       
   175         if($status==Tag::$TAG_URL_STATUS_DICT["match"] || $status==Tag::$TAG_URL_STATUS_DICT["redirection"]){
       
   176             if(array_key_exists("langlinks", $page)){
       
   177                 foreach ($page["langlinks"] as $ar) {
       
   178                     if($ar["lang"]=="en"){
       
   179                         $english_label = $ar["*"];
       
   180                         break;
       
   181                     }
       
   182                 }
       
   183             }
       
   184         }
       
   185         // We create the dbpedia uri.
       
   186         $dbpedia_uri = null;
       
   187         if($english_label!=null && strpos($english_label, '#')===false){
       
   188             $dbpedia_uri = WikiTagUtils::getDbpediaUri($english_label);
       
   189         }
       
   190         
       
   191         $wp_response = array('new_label'=>$new_label, 'status'=>$status, 'wikipedia_url'=>$url, 'pageid'=>$pageid, 'dbpedia_uri'=>$dbpedia_uri, 'revision_id'=>$revision_id, 'response'=>$original_response);
       
   192         //return $url." <br/>RES =  ".$res/*." <br/>DUMP =  ".var_dump($pages)*/." <br/>COUNT =  ".count($pages)." <br/>page =  ".var_dump($page);
       
   193         return $wp_response;
       
   194     }
       
   195     
       
   196 
       
   197     /**
       
   198      *
       
   199      * TODO : Enter description here ...
       
   200      * @param unknown_type $params
       
   201      * @return multitype:unknown mixed
       
   202      */
       
   203     private static function requestWikipedia($params)
       
   204     {
       
   205         $params_str = '';
       
   206         foreach ($params as $key => $value) {
       
   207             if ($params_str==''){
       
   208                 $params_str = $key.'='.$value;
       
   209             }
       
   210             else{
       
   211                 $params_str .= '&'.$key.'='.$value;
       
   212             }
       
   213         }
       
   214         
       
   215         $url = WikiTagUtils::$WIKIPEDIA_API_URL.'?'.$params_str;
       
   216         
       
   217         $ch = curl_init();
       
   218         curl_setopt($ch, CURLOPT_URL, $url);
       
   219         curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
       
   220         curl_setopt($ch, CURLOPT_USERAGENT, 'http://www.iri.centrepompidou.fr');
       
   221         $res = curl_exec($ch);
       
   222         curl_close($ch);
       
   223         
       
   224         $val = json_decode($res, true);
       
   225         $pages = $val["query"]["pages"];
       
   226         return array($res, $pages);
       
   227     }
       
   228     
       
   229     /**
       
   230      * Returns tag with a null result, usually used after a failed request on Wikipedia
       
   231      */
       
   232     private static function returnNullResult($response)
       
   233     {
       
   234         return array('new_label'=>null, 'status'=>Tag::$TAG_URL_STATUS_DICT['null_result'], 'wikipedia_url'=>null, 'pageid'=>null, 'dbpedia_uri'=>null, 'revision_id'=>null, 'response'=>$response);
       
   235     }
       
   236     
       
   237     /**
       
   238      * Returns tag with a null result, usually used after a failed request on Wikipedia
       
   239      */
       
   240     private static function isHomonymy($page)
       
   241     {
       
   242         //$s = "";
       
   243         foreach ($page["categories"] as $ar) {
       
   244             //$s .= ", b : ".$ar." - title = ".$ar["title"].", strpos = ".strpos($ar["title"], 'Catégorie:Homonymie');
       
   245             // Strict test because false can be seen as "O".
       
   246             if(strpos($ar["title"], 'Catégorie:Homonymie')!==false || strpos($ar["title"], 'Category:Disambiguation')!==false){
       
   247                 //$s .= "TRUE";
       
   248                 return true;
       
   249             }
       
   250         }
       
   251         return false;
       
   252     }
       
   253     
       
   254     /**
       
   255      * Builds DbPedia URI
       
   256      */
       
   257     private static function getDbpediaUri($english_label)
       
   258     {
       
   259         return sprintf(WikiTagUtils::$DBPEDIA_URI_TEMPLATE, WikiTagUtils::urlize_for_wikipedia($english_label));
       
   260     }
       
   261     
       
   262     /**
       
   263      * URLencode label for wikipedia
       
   264      */
       
   265     private static function urlize_for_wikipedia($label){
       
   266         return urlencode(str_replace(" ", "_", $label));
       
   267     }
       
   268 }