|
1 <?php |
|
2 |
|
3 /** |
|
4 * @file |
|
5 * Parser functions for the aggregator module. |
|
6 */ |
|
7 |
|
/**
 * Implements hook_aggregator_parse_info().
 *
 * @return
 *   An array with a translated 'title' and 'description' for this parser.
 */
function aggregator_aggregator_parse_info() {
  $info = array();
  $info['title'] = t('Default parser');
  $info['description'] = t('Parses RSS, Atom and RDF feeds.');
  return $info;
}
|
17 |
|
/**
 * Implements hook_aggregator_parse().
 *
 * Runs the feed source through aggregator_parse_feed() and, on success,
 * copies the channel level data collected in the $channel and $image globals
 * onto the feed object.
 *
 * @param $feed
 *   The feed object. Its source_string and http_headers properties are read;
 *   link, description, image, etag and modified are written.
 *
 * @return
 *   TRUE if the feed was parsed, FALSE otherwise.
 */
function aggregator_aggregator_parse($feed) {
  global $channel, $image;

  // Nothing to record when the source could not be parsed.
  if (!aggregator_parse_feed($feed->source_string, $feed)) {
    return FALSE;
  }

  // Derive the modification time from the Last-Modified header, if present.
  $modified = 0;
  if (!empty($feed->http_headers['last-modified'])) {
    $modified = strtotime($feed->http_headers['last-modified']);
  }

  // Strip surrounding whitespace from the channel and image values.
  $channel = array_map('trim', $channel);
  $image = array_map('trim', $image);

  $etag = '';
  if (!empty($feed->http_headers['etag'])) {
    $etag = $feed->http_headers['etag'];
  }

  // Add parsed data to the feed object, defaulting missing values to ''.
  $feed->link = empty($channel['link']) ? '' : $channel['link'];
  $feed->description = empty($channel['description']) ? '' : $channel['description'];
  $feed->image = empty($image['url']) ? '' : $image['url'];
  $feed->etag = $etag;
  $feed->modified = $modified;

  // Clear the cache.
  cache_clear_all();

  return TRUE;
}
|
55 |
|
/**
 * Parses a feed and stores its items.
 *
 * The XML is processed with the aggregator_element_start(),
 * aggregator_element_end() and aggregator_element_data() callbacks, which
 * collect their results in the $items, $image and $channel globals. The
 * collected items are then normalized and stored in $feed->items.
 *
 * @param $data
 *   The feed data.
 * @param $feed
 *   An object describing the feed to be parsed. Its title is used in error
 *   messages and its link is the fallback for items without a link.
 *
 * @return
 *   FALSE on error, TRUE otherwise.
 */
function aggregator_parse_feed(&$data, $feed) {
  global $items, $image, $channel;

  // Unset the global variables before we use them.
  unset($GLOBALS['element'], $GLOBALS['item'], $GLOBALS['tag']);
  $items = array();
  $image = array();
  $channel = array();

  // Parse the data.
  $xml_parser = drupal_xml_parser_create($data);
  // NOTE(review): drupal_xml_parser_create() is documented to return FALSE
  // when the encoding cannot be handled; confirm against the Drupal API docs.
  // Without a parser there is nothing to configure or free.
  if (!$xml_parser) {
    return FALSE;
  }
  xml_set_element_handler($xml_parser, 'aggregator_element_start', 'aggregator_element_end');
  xml_set_character_data_handler($xml_parser, 'aggregator_element_data');

  if (!xml_parse($xml_parser, $data, 1)) {
    // Collect the error details first, then release the parser so that it is
    // not leaked on the failure path.
    $message_args = array(
      '%site' => $feed->title,
      '%error' => xml_error_string(xml_get_error_code($xml_parser)),
      '%line' => xml_get_current_line_number($xml_parser),
    );
    xml_parser_free($xml_parser);

    watchdog('aggregator', 'The feed from %site seems to be broken, due to an error "%error" on line %line.', $message_args, WATCHDOG_WARNING);
    drupal_set_message(t('The feed from %site seems to be broken, because of error "%error" on line %line.', $message_args), 'error');
    return FALSE;
  }
  xml_parser_free($xml_parser);

  // We reverse the array such that we store the first item last, and the last
  // item first. In the database, the newest item should be at the top.
  $items = array_reverse($items);

  // Initialize items array.
  $feed->items = array();
  foreach ($items as $item) {

    // Prepare the item:
    foreach ($item as $key => $value) {
      $item[$key] = trim($value);
    }

    // Resolve the item's title. If no title is found, we use up to 40
    // characters of the description ending at a word boundary, but not
    // splitting potential entities.
    if (empty($item['title'])) {
      if (!empty($item['description'])) {
        $item['title'] = preg_replace('/^(.*)[^\w;&].*?$/', "\\1", truncate_utf8($item['description'], 40));
      }
      else {
        $item['title'] = '';
      }
    }

    // Resolve the item's link, falling back to the feed's link.
    if (empty($item['link'])) {
      $item['link'] = $feed->link;
    }

    // Atom feeds have an ID tag instead of a GUID tag.
    if (!isset($item['guid'])) {
      $item['guid'] = isset($item['id']) ? $item['id'] : '';
    }

    // Atom feeds have a content and/or summary tag instead of a description tag.
    if (!empty($item['content:encoded'])) {
      $item['description'] = $item['content:encoded'];
    }
    elseif (!empty($item['summary'])) {
      $item['description'] = $item['summary'];
    }
    elseif (!empty($item['content'])) {
      $item['description'] = $item['content'];
    }

    // Try to resolve and parse the item's publication date. The first
    // non-empty candidate tag wins.
    $date = '';
    foreach (array('pubdate', 'dc:date', 'dcterms:issued', 'dcterms:created', 'dcterms:modified', 'issued', 'created', 'modified', 'published', 'updated') as $key) {
      if (!empty($item[$key])) {
        $date = $item[$key];
        break;
      }
    }

    $item['timestamp'] = strtotime($date);

    if ($item['timestamp'] === FALSE) {
      // aggregator_parse_w3cdtf() returns FALSE on failure, so the timestamp
      // may still be FALSE afterwards.
      $item['timestamp'] = aggregator_parse_w3cdtf($date);
    }

    // Resolve dc:creator tag as the item author if author tag is not set.
    if (empty($item['author']) && !empty($item['dc:creator'])) {
      $item['author'] = $item['dc:creator'];
    }

    // Guarantee that the author and description keys exist.
    $item += array('author' => '', 'description' => '');

    // Store on $feed object. This is where processors will look for parsed items.
    $feed->items[] = $item;
  }

  return TRUE;
}
|
166 |
|
/**
 * Performs an action when an opening tag is encountered.
 *
 * Callback function used by xml_parse() within aggregator_parse_feed().
 *
 * Tracks the current container in the $element global, advances the $item
 * counter for each item/entry, records alternate links found in HREF
 * attributes, and stores the lower-cased tag name in the $tag global.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $name
 *   The name of the opened element; it is lower-cased before use.
 * @param $attributes
 *   The element's attributes, read with upper-case keys (HREF, REL).
 */
function aggregator_element_start($parser, $name, $attributes) {
  global $item, $element, $tag, $items, $channel;

  $name = strtolower($name);

  // Tags that always become the current container.
  $containers = array('image', 'textinput', 'summary', 'tagline', 'subtitle', 'logo', 'info');
  // Tags that are checked for an alternate link in their HREF attribute.
  $link_tags = array('id', 'content', 'link');

  if (in_array($name, $containers, TRUE)) {
    $element = $name;
  }
  elseif (in_array($name, $link_tags, TRUE)) {
    // 'id' and 'content' only become the current container outside of an
    // item; either way they then go through the same link handling as 'link'.
    if ($name != 'link' && $element != 'item') {
      $element = $name;
    }

    // According to RFC 4287, link elements in Atom feeds without a 'rel'
    // attribute should be interpreted as though the relation type is
    // "alternate".
    $has_href = !empty($attributes['HREF']);
    $is_alternate = empty($attributes['REL']) || $attributes['REL'] == 'alternate';
    if ($has_href && $is_alternate) {
      if ($element == 'item') {
        $items[$item]['link'] = $attributes['HREF'];
      }
      else {
        $channel['link'] = $attributes['HREF'];
      }
    }
  }
  elseif ($name == 'item' || $name == 'entry') {
    // RSS items and Atom entries are both tracked as 'item'.
    $element = 'item';
    $item += 1;
  }

  $tag = $name;
}
|
216 |
|
/**
 * Performs an action when a closing tag is encountered.
 *
 * Callback function used by xml_parse() within aggregator_parse_feed().
 *
 * Resets the $element global when the container it tracks is closed.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $name
 *   The name of the closed element; it is lower-cased before use.
 */
function aggregator_element_end($parser, $name) {
  global $element;

  // Normalize the name exactly as aggregator_element_start() does. Without
  // this, an upper-case name never matches the lower-case labels below and
  // $element is never reset.
  $name = strtolower($name);

  switch ($name) {
    case 'image':
    case 'textinput':
    case 'item':
    case 'entry':
    case 'info':
      $element = '';
      break;

    case 'id':
    case 'content':
      // Only reset when this tag is the current container; inside an item
      // $element stays 'item'.
      if ($element == $name) {
        $element = '';
      }
      break;
  }
}
|
240 |
|
/**
 * Performs an action when data is encountered.
 *
 * Callback function used by xml_parse() within aggregator_parse_feed().
 *
 * Appends the character data to the $items, $image or $channel global,
 * depending on the container currently recorded in the $element global.
 *
 * @param $parser
 *   The XML parser invoking the callback (unused).
 * @param $data
 *   The chunk of character data to append.
 */
function aggregator_element_data($parser, $data) {
  global $channel, $element, $items, $item, $image, $tag;

  // Make sure an entry exists for the current item index.
  $items += array($item => array());

  if ($element == 'item') {
    // Inside an item: collect the data under the current tag name.
    $items[$item] += array($tag => '');
    $items[$item][$tag] .= $data;
  }
  elseif ($element == 'image' || $element == 'logo') {
    $image += array($tag => '');
    $image[$tag] .= $data;
  }
  elseif ($element == 'link') {
    // Empty chunks are skipped for links.
    if ($data) {
      $items[$item] += array($tag => '');
      $items[$item][$tag] .= $data;
    }
  }
  elseif ($element == 'content') {
    $items[$item] += array('content' => '');
    $items[$item]['content'] .= $data;
  }
  elseif ($element == 'summary') {
    $items[$item] += array('summary' => '');
    $items[$item]['summary'] .= $data;
  }
  elseif ($element == 'tagline' || $element == 'subtitle') {
    // Both tags feed the channel description.
    $channel += array('description' => '');
    $channel['description'] .= $data;
  }
  elseif ($element == 'info' || $element == 'id' || $element == 'textinput') {
    // The sub-element is not supported. However, we must recognize
    // it or its contents will end up in the item array.
  }
  else {
    // Anything else is channel level data, stored under the current tag.
    $channel += array($tag => '');
    $channel[$tag] .= $data;
  }
}
|
289 |
|
/**
 * Parses the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format. See
 * http://www.w3.org/TR/NOTE-datetime for more information. Originally from
 * MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * Seconds and the timezone designator are optional. A missing designator is
 * treated like "Z" (GMT).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 *
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function aggregator_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":SS",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
    return FALSE;
  }

  // preg_match() omits trailing optional groups that did not participate in
  // the match, so pad the result to make every group index safe to read.
  $match += array_fill(0, 12, '');

  // Group 7 holds the bare seconds; group 6 includes the leading colon.
  $seconds = ($match[7] === '') ? 0 : (int) $match[7];

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT; only an explicit +/- offset needs adjusting.
  if ($match[11] !== 'Z' && $match[8] !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}