// Utils/WikiTagUtils.php as added by changeset 13f43f53d0ba (parent 06a22ff5d58d), Sun Oct 16 14:50:48 2011 +0200.
// The beginning of the file (namespace, imports, class header, static properties) is not part of this excerpt.

    /**
     * Finds the tag matching a label, or creates and persists it when none exists.
     *
     * The label is normalized and existing tags are searched by normalized label. Among the
     * matches, the first one whose URL status differs from 'null_result' is preferred;
     * otherwise the first match is kept. When there is no match, a new tag is built, filled
     * with the data returned by Wikipedia for the normalized label, and flushed to the database.
     * For an existing tag that has a Wikipedia page id, Wikipedia is only queried to obtain the
     * current revision id.
     *
     * @param string $tag_label the raw label
     * @param mixed $doctrine object exposing getRepository() and getEntityManager()
     * @return array array(tag, wikipedia revision id or null, boolean telling if the tag was created)
     */
    public static function getOrCreateTag($tag_label, $doctrine)
    {
        $tag_label_normalized = self::normalizeTag($tag_label);

        // We get or create the tag object.
        $tags = $doctrine->getRepository('WikiTagBundle:Tag')->findBy(array('normalizedLabel' => $tag_label_normalized));
        $tag = null;
        foreach ($tags as $candidate) {
            if ($candidate->getUrlStatus() != Tag::$TAG_URL_STATUS_DICT['null_result']) {
                // A tag with a usable Wikipedia status always wins.
                $tag = $candidate;
                break;
            }
            if ($tag == null) {
                // Fallback: remember the first null-result tag.
                $tag = $candidate;
            }
        }

        if ($tag != null) {
            // Existing tag: Wikipedia is only asked for the current revision id.
            $wikipedia_revision_id = null;
            if ($tag->getWikipediaPageId() != null) {
                $wp_response = self::getWikipediaInfo(null, $tag->getWikipediaPageId());
                $wikipedia_revision_id = $wp_response['revision_id'];
            }
            return array($tag, $wikipedia_revision_id, false);
        }

        // New tag: we request Wikipedia with the normalized label.
        $tag = new Tag();
        $tag->setLabel($tag_label_normalized);
        $tag->setOriginalLabel($tag_label);
        $tag->setNormalizedLabel($tag_label_normalized);

        $wp_response = self::getWikipediaInfo($tag_label_normalized);
        if ($wp_response['new_label'] != null) {
            $tag->setLabel($wp_response['new_label']);
        }
        // NOTE(review): loose comparison kept from the original; a status whose value is 0
        // is not written to the tag. Confirm against Tag::$TAG_URL_STATUS_DICT.
        if ($wp_response['status'] != null) {
            $tag->setUrlStatus($wp_response['status']);
        }
        $tag->setWikipediaUrl($wp_response['wikipedia_url']);
        $tag->setWikipediaPageId($wp_response['pageid']);
        $tag->setDbpediaUri($wp_response['dbpedia_uri']);

        // Save datas.
        $em = $doctrine->getEntityManager();
        $em->persist($tag);
        $em->flush();

        return array($tag, $wp_response['revision_id'], true);
    }

    /**
     * Cleans the tag label: underscores become spaces, runs of whitespace are collapsed to a
     * single space, surrounding whitespace is removed and the first character is upper-cased.
     *
     * Trimming is done last so that a leading or trailing underscore does not leave a space
     * at the edge of the label.
     *
     * @param string $tag_label the raw label
     * @return string the normalized label (an empty label is returned untouched)
     */
    private static function normalizeTag($tag_label)
    {
        if (strlen($tag_label) == 0) {
            return $tag_label;
        }
        $tag_label = str_replace('_', ' ', $tag_label);
        $tag_label = preg_replace('/\s+/', ' ', $tag_label);
        $tag_label = trim($tag_label);
        // NOTE(review): ucfirst() works on bytes, so a label starting with a multi-byte
        // (e.g. accented) letter is left as is. Changing this would alter stored normalized labels.
        return ucfirst($tag_label);
    }

    /**
     * Queries the Wikipedia API for one page, identified by its title or by its page id.
     *
     * The request is considered failed (null result) when neither identifier is given, when
     * the API does not return exactly one page, or when the page is flagged 'missing' or
     * 'invalid'. For a redirect page a second request is sent with redirects=true, and the
     * revision id and language links are read from that second answer, while the url, page id
     * and label still come from the first one.
     *
     * @param string|null $tag_label_normalized the normalized label, used as page title
     * @param mixed $page_id the Wikipedia page id, used when no label is given
     * @return array associative array with the keys new_label, status, wikipedia_url, pageid,
     *               dbpedia_uri, revision_id and response (raw body of the first request)
     */
    private static function getWikipediaInfo($tag_label_normalized, $page_id=null)
    {
        $params = array(
            'action' => 'query',
            'prop' => 'info|categories|langlinks',
            'inprop' => 'url',
            'lllimit' => '500',
            'cllimit' => '500',
            'rvprop' => 'ids',
            'format' => 'json',
        );
        // Values are passed raw: requestWikipedia() is in charge of the URL encoding.
        if ($tag_label_normalized != null) {
            $params['titles'] = $tag_label_normalized;
        }
        else if ($page_id != null) {
            $params['pageids'] = $page_id;
        }
        else {
            return self::returnNullResult(null);
        }

        $answer = self::requestWikipedia($params);
        $res = $answer[0];
        $original_response = $res;
        $pages = $answer[1];
        // If there is 0 or more than 1 result, the query has failed.
        if (count($pages) != 1) {
            return self::returnNullResult($res);
        }
        $page = reset($pages);
        // Unknown entry?
        if (array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
            return self::returnNullResult($res);
        }

        // The entry exists, we get the datas.
        $url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
        $pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
        $new_label = array_key_exists('title', $page) ? $page['title'] : null;

        // We test the status (redirect first because a redirect has no categories key).
        if (array_key_exists('redirect', $page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
        }
        else if (self::isHomonymy($page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
        }
        else {
            $status = Tag::$TAG_URL_STATUS_DICT["match"];
        }

        // In redirection, we have to get more datas by adding redirects=true to the params.
        if ($status == Tag::$TAG_URL_STATUS_DICT["redirection"]) {
            $params['redirects'] = "true";
            $answer = self::requestWikipedia($params);
            $res = $answer[0];
            $pages = $answer[1];
            if (count($pages) != 1) {
                return self::returnNullResult($res);
            }
            $page = reset($pages);
        }

        // The key can be absent (for instance when the redirect target does not exist).
        $revision_id = isset($page['lastrevid']) ? $page['lastrevid'] : null;

        // Process the language links to extract the english label.
        $english_label = null;
        $wants_english_label = ($status == Tag::$TAG_URL_STATUS_DICT["match"] || $status == Tag::$TAG_URL_STATUS_DICT["redirection"]);
        if ($wants_english_label && array_key_exists("langlinks", $page)) {
            foreach ($page["langlinks"] as $langlink) {
                if ($langlink["lang"] == "en") {
                    $english_label = $langlink["*"];
                    break;
                }
            }
        }

        // We create the dbpedia uri (labels containing a '#' are skipped).
        $dbpedia_uri = null;
        if ($english_label != null && strpos($english_label, '#') === false) {
            $dbpedia_uri = self::getDbpediaUri($english_label);
        }

        return array(
            'new_label' => $new_label,
            'status' => $status,
            'wikipedia_url' => $url,
            'pageid' => $pageid,
            'dbpedia_uri' => $dbpedia_uri,
            'revision_id' => $revision_id,
            'response' => $original_response,
        );
    }

    /**
     * Sends a GET request to the Wikipedia API and decodes the JSON answer.
     *
     * @param array $params query parameters as raw (not URL-encoded) name => value pairs
     * @return array array(raw response body, or false when the request failed; array of the
     *               pages found under query.pages, empty when the answer is unusable)
     */
    private static function requestWikipedia($params)
    {
        // The '&' separator is explicit so that the arg_separator.output ini setting cannot alter the URL.
        $url = self::$WIKIPEDIA_API_URL.'?'.http_build_query($params, '', '&');

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, 'http://www.iri.centrepompidou.fr');
        $res = curl_exec($ch);
        curl_close($ch);

        // Callers count() the pages, so they always get an array, even after a transport
        // failure or an answer that is not the expected JSON structure.
        $pages = array();
        if (is_string($res)) {
            $val = json_decode($res, true);
            if (is_array($val) && isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
                $pages = $val["query"]["pages"];
            }
        }
        return array($res, $pages);
    }

    /**
     * Returns tag with a null result, usually used after a failed request on Wikipedia
     *
     * @param mixed $response the raw response to embed under the 'response' key
     * @return array same keys as the result of getWikipediaInfo(), all null except status and response
     */
    private static function returnNullResult($response)
    {
        return array(
            'new_label' => null,
            'status' => Tag::$TAG_URL_STATUS_DICT['null_result'],
            'wikipedia_url' => null,
            'pageid' => null,
            'dbpedia_uri' => null,
            'revision_id' => null,
            'response' => $response,
        );
    }

    /**
     * Tells if a page is a homonymy (disambiguation) page, i.e. if the title of one of its
     * categories contains 'Catégorie:Homonymie' or 'Category:Disambiguation'.
     *
     * @param array $page one page entry of the API answer
     * @return boolean true for a homonymy page, false otherwise (also when the page has no categories)
     */
    private static function isHomonymy($page)
    {
        if (!isset($page["categories"]) || !is_array($page["categories"])) {
            return false;
        }
        foreach ($page["categories"] as $category) {
            // Strict test because strpos returns 0 when the title starts with the searched text.
            if (strpos($category["title"], 'Catégorie:Homonymie') !== false || strpos($category["title"], 'Category:Disambiguation') !== false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds DbPedia URI
     *
     * @param string $english_label the label of the english Wikipedia page
     * @return string the URI template filled with the url-encoded label
     */
    private static function getDbpediaUri($english_label)
    {
        return sprintf(self::$DBPEDIA_URI_TEMPLATE, self::urlize_for_wikipedia($english_label));
    }

    /**
     * URLencode label for wikipedia: spaces become underscores, then the label is url-encoded.
     *
     * @param string $label
     * @return string
     */
    private static function urlize_for_wikipedia($label)
    {
        return urlencode(str_replace(" ", "_", $label));
    }
}