cms/drupal/modules/aggregator/aggregator.parser.inc
changeset 541 e756a8c72c3d
equal deleted inserted replaced
540:07239de796bb 541:e756a8c72c3d
       
     1 <?php
       
     2 
       
     3 /**
       
     4  * @file
       
     5  * Parser functions for the aggregator module.
       
     6  */
       
     7 
       
     8 /**
       
     9  * Implements hook_aggregator_parse_info().
       
    10  */
       
    11 function aggregator_aggregator_parse_info() {
       
    12   return array(
       
    13     'title' => t('Default parser'),
       
    14     'description' => t('Parses RSS, Atom and RDF feeds.'),
       
    15   );
       
    16 }
       
    17 
       
    18 /**
       
    19  * Implements hook_aggregator_parse().
       
    20  */
       
    21 function aggregator_aggregator_parse($feed) {
       
    22   global $channel, $image;
       
    23 
       
    24   // Filter the input data.
       
    25   if (aggregator_parse_feed($feed->source_string, $feed)) {
       
    26     $modified = empty($feed->http_headers['last-modified']) ? 0 : strtotime($feed->http_headers['last-modified']);
       
    27 
       
    28     // Prepare the channel data.
       
    29     foreach ($channel as $key => $value) {
       
    30       $channel[$key] = trim($value);
       
    31     }
       
    32 
       
    33     // Prepare the image data (if any).
       
    34     foreach ($image as $key => $value) {
       
    35       $image[$key] = trim($value);
       
    36     }
       
    37 
       
    38     $etag = empty($feed->http_headers['etag']) ? '' : $feed->http_headers['etag'];
       
    39 
       
    40     // Add parsed data to the feed object.
       
    41     $feed->link = !empty($channel['link']) ? $channel['link'] : '';
       
    42     $feed->description = !empty($channel['description']) ? $channel['description'] : '';
       
    43     $feed->image = !empty($image['url']) ? $image['url'] : '';
       
    44     $feed->etag = $etag;
       
    45     $feed->modified = $modified;
       
    46 
       
    47     // Clear the cache.
       
    48     cache_clear_all();
       
    49 
       
    50     return TRUE;
       
    51   }
       
    52 
       
    53   return FALSE;
       
    54 }
       
    55 
       
    56 /**
       
    57  * Parses a feed and stores its items.
       
    58  *
       
    59  * @param $data
       
    60  *   The feed data.
       
    61  * @param $feed
       
    62  *   An object describing the feed to be parsed.
       
    63  *
       
    64  * @return
       
    65  *   FALSE on error, TRUE otherwise.
       
    66  */
       
    67 function aggregator_parse_feed(&$data, $feed) {
       
    68   global $items, $image, $channel;
       
    69 
       
    70   // Unset the global variables before we use them.
       
    71   unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
       
    72   $items = array();
       
    73   $image = array();
       
    74   $channel = array();
       
    75 
       
    76   // Parse the data.
       
    77   $xml_parser = drupal_xml_parser_create($data);
       
    78   xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
       
    79   xml_set_character_data_handler($xml_parser, 'aggregator_element_data');
       
    80 
       
    81   if (!xml_parse($xml_parser, $data, 1)) {
       
    82     watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', array('%site' => $feed->title, '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser)), WATCHDOG_WARNING);
       
    83     drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', array('%site' => $feed->title, '%error' => xml_error_string(xml_get_error_code($xml_parser)), '%line' => xml_get_current_line_number($xml_parser))), 'error');
       
    84     return FALSE;
       
    85   }
       
    86   xml_parser_free($xml_parser);
       
    87 
       
    88   // We reverse the array such that we store the first item last, and the last
       
    89   // item first. In the database, the newest item should be at the top.
       
    90   $items = array_reverse($items);
       
    91 
       
    92   // Initialize items array.
       
    93   $feed->items = array();
       
    94   foreach ($items as $item) {
       
    95 
       
    96     // Prepare the item:
       
    97     foreach ($item as $key => $value) {
       
    98       $item[$key] = trim($value);
       
    99     }
       
   100 
       
   101     // Resolve the item's title. If no title is found, we use up to 40
       
   102     // characters of the description ending at a word boundary, but not
       
   103     // splitting potential entities.
       
   104     if (!empty($item['title'])) {
       
   105       $item['title'] = $item['title'];
       
   106     }
       
   107     elseif (!empty($item['description'])) {
       
   108       $item['title'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['description'], 40));
       
   109     }
       
   110     else {
       
   111       $item['title'] = '';
       
   112     }
       
   113 
       
   114     // Resolve the items link.
       
   115     if (!empty($item['link'])) {
       
   116       $item['link'] = $item['link'];
       
   117     }
       
   118     else {
       
   119       $item['link'] = $feed->link;
       
   120     }
       
   121 
       
   122     // Atom feeds have an ID tag instead of a GUID tag.
       
   123     if (!isset($item['guid'])) {
       
   124       $item['guid'] = isset($item['id']) ? $item['id'] : '';
       
   125     }
       
   126 
       
   127     // Atom feeds have a content and/or summary tag instead of a description tag.
       
   128     if (!empty($item['content:encoded'])) {
       
   129       $item['description'] = $item['content:encoded'];
       
   130     }
       
   131     elseif (!empty($item['summary'])) {
       
   132       $item['description'] = $item['summary'];
       
   133     }
       
   134     elseif (!empty($item['content'])) {
       
   135       $item['description'] = $item['content'];
       
   136     }
       
   137 
       
   138     // Try to resolve and parse the item's publication date.
       
   139     $date = '';
       
   140     foreach (array('pubdate', 'dc:date', 'dcterms:issued', 'dcterms:created', 'dcterms:modified', 'issued', 'created', 'modified', 'published', 'updated') as $key) {
       
   141       if (!empty($item[$key])) {
       
   142         $date = $item[$key];
       
   143         break;
       
   144       }
       
   145     }
       
   146 
       
   147     $item['timestamp'] = strtotime($date);
       
   148 
       
   149     if ($item['timestamp'] === FALSE) {
       
   150       $item['timestamp'] = aggregator_parse_w3cdtf($date); // Aggregator_parse_w3cdtf() returns FALSE on failure.
       
   151     }
       
   152 
       
   153     // Resolve dc:creator tag as the item author if author tag is not set.
       
   154     if (empty($item['author']) && !empty($item['dc:creator'])) {
       
   155       $item['author'] = $item['dc:creator'];
       
   156     }
       
   157 
       
   158     $item += array('author' => '', 'description' => '');
       
   159 
       
   160     // Store on $feed object. This is where processors will look for parsed items.
       
   161     $feed->items[] = $item;
       
   162   }
       
   163 
       
   164   return TRUE;
       
   165 }
       
   166 
       
   167 /**
       
   168  * Performs an action when an opening tag is encountered.
       
   169  *
       
   170  * Callback function used by xml_parse() within aggregator_parse_feed().
       
   171  */
       
   172 function aggregator_element_start($parser, $name, $attributes) {
       
   173   global $item, $element, $tag, $items, $channel;
       
   174 
       
   175   $name = strtolower($name);
       
   176   switch ($name) {
       
   177     case 'image':
       
   178     case 'textinput':
       
   179     case 'summary':
       
   180     case 'tagline':
       
   181     case 'subtitle':
       
   182     case 'logo':
       
   183     case 'info':
       
   184       $element = $name;
       
   185       break;
       
   186     case 'id':
       
   187     case 'content':
       
   188       if ($element != 'item') {
       
   189         $element = $name;
       
   190       }
       
   191     case 'link':
       
   192       // According to RFC 4287, link elements in Atom feeds without a 'rel'
       
   193       // attribute should be interpreted as though the relation type is
       
   194       // "alternate".
       
   195       if (!empty($attributes['HREF']) && (empty($attributes['REL']) || $attributes['REL'] == 'alternate')) {
       
   196         if ($element == 'item') {
       
   197           $items[$item]['link'] = $attributes['HREF'];
       
   198         }
       
   199         else {
       
   200           $channel['link'] = $attributes['HREF'];
       
   201         }
       
   202       }
       
   203       break;
       
   204     case 'item':
       
   205       $element = $name;
       
   206       $item += 1;
       
   207       break;
       
   208     case 'entry':
       
   209       $element = 'item';
       
   210       $item += 1;
       
   211       break;
       
   212   }
       
   213 
       
   214   $tag = $name;
       
   215 }
       
   216 
       
   217 /**
       
   218  * Performs an action when a closing tag is encountered.
       
   219  *
       
   220  * Callback function used by xml_parse() within aggregator_parse_feed().
       
   221  */
       
   222 function aggregator_element_end($parser, $name) {
       
   223   global $element;
       
   224 
       
   225   switch ($name) {
       
   226     case 'image':
       
   227     case 'textinput':
       
   228     case 'item':
       
   229     case 'entry':
       
   230     case 'info':
       
   231       $element = '';
       
   232       break;
       
   233     case 'id':
       
   234     case 'content':
       
   235       if ($element == $name) {
       
   236         $element = '';
       
   237       }
       
   238   }
       
   239 }
       
   240 
       
   241 /**
       
   242  * Performs an action when data is encountered.
       
   243  *
       
   244  * Callback function used by xml_parse() within aggregator_parse_feed().
       
   245  */
       
   246 function aggregator_element_data($parser, $data) {
       
   247   global $channel, $element, $items, $item, $image, $tag;
       
   248   $items += array($item => array());
       
   249   switch ($element) {
       
   250     case 'item':
       
   251       $items[$item] += array($tag => '');
       
   252       $items[$item][$tag] .= $data;
       
   253       break;
       
   254     case 'image':
       
   255     case 'logo':
       
   256       $image += array($tag => '');
       
   257       $image[$tag] .= $data;
       
   258       break;
       
   259     case 'link':
       
   260       if ($data) {
       
   261         $items[$item] += array($tag => '');
       
   262         $items[$item][$tag] .= $data;
       
   263       }
       
   264       break;
       
   265     case 'content':
       
   266       $items[$item] += array('content' => '');
       
   267       $items[$item]['content'] .= $data;
       
   268       break;
       
   269     case 'summary':
       
   270       $items[$item] += array('summary' => '');
       
   271       $items[$item]['summary'] .= $data;
       
   272       break;
       
   273     case 'tagline':
       
   274     case 'subtitle':
       
   275       $channel += array('description' => '');
       
   276       $channel['description'] .= $data;
       
   277       break;
       
   278     case 'info':
       
   279     case 'id':
       
   280     case 'textinput':
       
   281       // The sub-element is not supported. However, we must recognize
       
   282       // it or its contents will end up in the item array.
       
   283       break;
       
   284     default:
       
   285       $channel += array($tag => '');
       
   286       $channel[$tag] .= $data;
       
   287   }
       
   288 }
       
   289 
       
   290 /**
       
   291  * Parses the W3C date/time format, a subset of ISO 8601.
       
   292  *
       
   293  * PHP date parsing functions do not handle this format. See
       
   294  * http://www.w3.org/TR/NOTE-datetime for more information. Originally from
       
   295  * MagpieRSS (http://magpierss.sourceforge.net/).
       
   296  *
       
   297  * @param $date_str
       
   298  *   A string with a potentially W3C DTF date.
       
   299  *
       
   300  * @return
       
   301  *   A timestamp if parsed successfully or FALSE if not.
       
   302  */
       
   303 function aggregator_parse_w3cdtf($date_str) {
       
   304   if (preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
       
   305     list($year, $month, $day, $hours, $minutes, $seconds) = array($match[1], $match[2], $match[3], $match[4], $match[5], $match[6]);
       
   306     // Calculate the epoch for current date assuming GMT.
       
   307     $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year);
       
   308     if ($match[10] != 'Z') { // Z is zulu time, aka GMT
       
   309       list($tz_mod, $tz_hour, $tz_min) = array($match[8], $match[9], $match[10]);
       
   310       // Zero out the variables.
       
   311       if (!$tz_hour) {
       
   312         $tz_hour = 0;
       
   313       }
       
   314       if (!$tz_min) {
       
   315         $tz_min = 0;
       
   316       }
       
   317       $offset_secs = (($tz_hour * 60) + $tz_min) * 60;
       
   318       // Is timezone ahead of GMT?  If yes, subtract offset.
       
   319       if ($tz_mod == '+') {
       
   320         $offset_secs *= -1;
       
   321       }
       
   322       $epoch += $offset_secs;
       
   323     }
       
   324     return $epoch;
       
   325   }
       
   326   else {
       
   327     return FALSE;
       
   328   }
       
   329 }