|
1 #!/usr/bin/env python |
|
2 """html2text: Turn HTML into equivalent Markdown-structured text.""" |
|
3 __version__ = "2.35" |
|
4 __author__ = "Aaron Swartz (me@aaronsw.com)" |
|
5 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." |
|
6 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"] |
|
7 |
|
8 # TODO: |
|
9 # Support decoded entities with unifiable. |
|
10 # Relative URL resolution |
|
11 |
|
12 if not hasattr(__builtins__, 'True'): True, False = 1, 0 |
|
13 import re, sys, urllib, htmlentitydefs, codecs, StringIO, types |
|
14 import sgmllib |
|
15 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') |
|
16 |
|
# textwrap is optional: BODY_WIDTH wrapping is documented in this file as
# requiring Python 2.3.  Only a failed import is tolerated, and `wrap` is then
# bound to None so that optwrap()'s `assert wrap, ...` reports the real problem
# instead of raising NameError.
try:
    from textwrap import wrap
except ImportError:
    wrap = None
|
19 |
|
# Use Unicode characters instead of their ASCII pseudo-replacements
|
21 UNICODE_SNOB = 0 |
|
22 |
|
23 # Put the links after each paragraph instead of at the end. |
|
24 LINKS_EACH_PARAGRAPH = 0 |
|
25 |
|
26 # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) |
|
27 BODY_WIDTH = 78 |
|
28 |
|
29 # Don't show internal links (href="#local-anchor") -- corresponding link targets |
|
30 # won't be visible in the plain text file anyway. |
|
31 SKIP_INTERNAL_LINKS = False |
|
32 |
|
33 ### Entity Nonsense ### |
|
34 |
|
def name2cp(k):
    """Return the Unicode code point (an int) for the HTML entity name k.

    Raises KeyError when the name is not a known entity.
    """
    # 'apos' is special-cased before the htmlentitydefs tables are consulted.
    if k == 'apos':
        return ord("'")

    codepoints = getattr(htmlentitydefs, "name2codepoint", None)  # requires Python 2.3
    if codepoints is not None:
        return codepoints[k]

    # Older fallback: entitydefs maps a name either to a latin-1 character
    # or to a "&#NNN;" reference for characters outside latin-1.
    definition = htmlentitydefs.entitydefs[k]
    if definition.startswith("&#") and definition.endswith(";"):
        return int(definition[2:-1])  # not in latin-1
    return ord(codecs.latin_1_decode(definition)[0])
|
43 |
|
# Entity name -> plain replacement text, used when UNICODE_SNOB is off.
unifiable = {
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ',
    'rarr': '->', 'larr': '<-', 'middot': '*',
    'ndash': '-', 'oelig': 'oe', 'aelig': 'ae',
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a',
    'atilde': 'a', 'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o',
    'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}

# The same replacements keyed by code point, for numeric character references.
unifiable_n = {}
for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]
|
57 |
|
def charref(name):
    """Return the text for a numeric character reference.

    name is the reference without its "&#" prefix and ";" suffix: decimal
    digits, or hex digits preceded by 'x' or 'X'.  Unless UNICODE_SNOB is
    set, code points listed in unifiable_n are replaced by their plain
    equivalent; otherwise the corresponding Unicode character is returned.

    A reference that cannot be decoded is returned verbatim as
    "&#" + name + ";" instead of raising.  The patched sgmllib.charref
    pattern accepts hex digits without an 'x' prefix (e.g. "&#ff;"), so
    such input does reach this function.
    """
    try:
        # name[:1] rather than name[0] so an empty name takes the error path.
        if name[:1] in ('x', 'X'):
            c = int(name[1:], 16)
        else:
            c = int(name)

        if not UNICODE_SNOB and c in unifiable_n:
            return unifiable_n[c]
        # unichr raises for code points outside the supported range.
        return unichr(c)
    except (ValueError, OverflowError):
        return "&#" + name + ";"
|
68 |
|
def entityref(c):
    """Return the text for the named entity c (given without "&" and ";").

    Unless UNICODE_SNOB is set, names listed in unifiable map to their plain
    replacement.  Other known names map to the Unicode character; unknown
    names are returned as "&" followed by the name.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)
|
76 |
|
def replaceEntities(s):
    """Substitution callback: decode the reference captured in group 1 of s.

    A leading "#" marks a numeric reference, handled by charref(); anything
    else is treated as an entity name and handled by entityref().
    """
    ref = s.group(1)
    if ref[0] != "#":
        return entityref(ref)
    return charref(ref[1:])
|
82 |
|
# Matches "&...;" where the body is an optional "#", an optional x/X and then
# either hex digits or up to eight word characters; group 1 is the body.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")


def unescape(s):
    """Return s with every reference matched by r_unescape decoded."""
    decoded = r_unescape.sub(replaceEntities, s)
    return decoded
|
86 |
|
def fixattrs(attrs):
    """Return attrs with entity references in every value decoded.

    attrs is a sequence of (name, value) pairs.  A false value (None or an
    empty sequence) is returned unchanged; otherwise a new list is built.
    """
    # Fix bug in sgmllib.py
    if attrs:
        return [(pair[0], unescape(pair[1])) for pair in attrs]
    return attrs
|
94 |
|
95 ### End Entity Nonsense ### |
|
96 |
|
def onlywhite(line):
    """Return true if line is non-empty and consists only of spaces and tabs.

    An empty line yields a false value, as before.  Characters are compared
    by value: identity tests against string literals only work through an
    interning accident.  The original tested the same literal twice, so the
    second test is taken to have meant a tab.
    """
    for c in line:
        if c != ' ' and c != '\t':
            return False
    return len(line) > 0
|
103 |
|
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns text unchanged when BODY_WIDTH is 0.  Otherwise each line of
    text is handled as follows:

    * a line starting with a space, '-' or '*' is copied as is, unless
      onlywhite() reports it as blank, in which case it is dropped;
    * any other non-empty line is wrapped to BODY_WIDTH columns and followed
      by a blank line;
    * empty lines are copied, but never more than two newlines in a row.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []     # output fragments, joined once at the end
    newlines = 0    # consecutive newlines emitted so far (capped at 2)
    for para in text.split("\n"):
        if len(para) > 0:
            # Compare by value; `is` on string literals is not reliable.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    pieces.append(line + "\n")
                pieces.append("\n")
                newlines = 2
            else:
                if not onlywhite(para):
                    pieces.append(para + "\n")
                    newlines = 1
        else:
            if newlines < 2:
                pieces.append("\n")
                newlines += 1
    return ''.join(pieces)
|
128 |
|
def hn(tag):
    """Return the heading level (1-9) for tags "h1".."h9", otherwise 0.

    Always returns an int: the original returned None for non-heading tags
    and for "h0", and raised IndexError on an empty tag.  Callers that only
    test the truth of the result are unaffected.
    """
    if len(tag) == 2 and tag[0] == 'h' and tag[1] in '123456789':
        return int(tag[1])
    return 0
|
135 |
|
class _html2text(sgmllib.SGMLParser):
    """SGML parser that writes the HTML it is fed as Markdown text.

    Output fragments are handed to the `out` callable as they are produced.
    When `out` is None they are accumulated in `self.outtext`, which
    close() returns.
    """

    def __init__(self, out=sys.stdout.write):
        sgmllib.SGMLParser.__init__(self)

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out
        self.outtext = u''      # accumulated output (only used when out is None)
        self.quiet = 0          # > 0 while inside <head>/<style>/<script>; output is suppressed
        self.p_p = 0            # newlines to emit before the next output (0, 1 or 2)
        self.outcount = 0       # number of fragments written by o()
        self.start = 1          # when set, o() discards pending space / line breaks
        self.space = 0          # a single collapsed space is pending
        self.a = []             # link/image attribute dicts whose reference line is not yet written
        self.astack = []        # one entry per open <a>; None when the link is not rendered
        self.acount = 0         # last reference number assigned
        self.list = []          # stack of open lists: {'name': 'ol'|'ul', 'num': items so far}
        self.blockquote = 0     # <blockquote> nesting depth
        self.pre = 0            # 1 while inside <pre>
        self.startpre = 0       # set on <pre>, cleared by the next o()
        self.lastWasNL = 0      # true when the last fragment ended with a newline
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None   # text seen inside the <abbr> being defined
        self.abbr_list = {}     # abbreviation text -> title, written at the end

    def outtextf(self, s):
        """Default sink: append s to self.outtext."""
        self.outtext += s

    def close(self):
        """Finish parsing, flush the trailing output and return self.outtext."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        self.o(charref(c))

    def handle_entityref(self, c):
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """Return the index in self.a of a link equal to attrs, or None.

        Two links are equal when their 'href' values match and either both
        lack a 'title' or both carry the same 'title'.
        """
        if 'href' not in attrs:
            return None

        for i, a in enumerate(self.a):
            if 'href' not in a or a['href'] != attrs['href']:
                continue
            if 'title' in a or 'title' in attrs:
                if ('title' in a and 'title' in attrs and
                        a['title'] == attrs['title']):
                    return i
            else:
                return i
        return None

    def handle_tag(self, tag, attrs, start):
        """Emit the Markdown for one tag.

        tag   -- tag name as reported by sgmllib
        attrs -- list of (name, value) pairs for a start tag, None for an end tag
        start -- true for a start tag, false for an end tag
        """
        attrs = fixattrs(attrs)

        heading = hn(tag)
        if heading:
            self.p()
            if start:
                self.o(heading * "#" + ' ')

        if tag in ('p', 'div'):
            self.p()

        # Markdown needs two or more trailing spaces for a hard line break;
        # a single space before the newline is just a soft wrap.
        if tag == "br" and start:
            self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ("head", "style", "script"):
            if start:
                self.quiet += 1
            elif self.quiet > 0:
                # A stray end tag must not push the counter below zero:
                # any non-zero value suppresses all output.
                self.quiet -= 1

        if tag == "body":
            self.quiet = 0  # sites like 9rules.com never close <head>

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                # Ignore an unmatched </blockquote> rather than going negative.
                if self.blockquote > 0:
                    self.blockquote -= 1
                self.p()

        if tag in ('em', 'i', 'u'):
            self.o("_")
        if tag in ('strong', 'b'):
            self.o("**")
        if tag == "code" and not self.pre:
            self.o('`')  # TODO: `` `this` ``

        if tag == "abbr":
            if start:
                attrs = dict(attrs or [])

                self.abbr_title = None
                self.abbr_data = ''
                if 'title' in attrs:
                    self.abbr_title = attrs['title']
            else:
                if self.abbr_title is not None:
                    self.abbr_list[self.abbr_data] = self.abbr_title
                    self.abbr_title = None
                # Stop collecting until the next <abbr>; leaving '' here made
                # o() append every later fragment to abbr_data for nothing.
                self.abbr_data = None

        if tag == "a":
            if start:
                attrs = dict(attrs or [])
                if 'href' in attrs and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    self.astack.append(None)
            elif self.astack:
                a = self.astack.pop()
                if a:
                    i = self.previousIndex(a)
                    if i is not None:
                        # Same target seen before: reuse its reference number.
                        a = self.a[i]
                    else:
                        self.acount += 1
                        a['count'] = self.acount
                        a['outcount'] = self.outcount
                        self.a.append(a)
                    self.o("][" + str(a['count']) + "]")

        if tag == "img" and start:
            attrs = dict(attrs or [])
            if 'src' in attrs:
                # Images share the link reference list, keyed by 'href'.
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("][" + str(attrs['count']) + "]")

        if tag == 'dl' and start:
            self.p()
        if tag == 'dt' and not start:
            self.pbr()
        # NOTE(review): indent widths for <dd> and nested <li> are kept as
        # found; confirm against upstream in case whitespace was lost.
        if tag == 'dd' and start:
            self.o(' ')
        if tag == 'dd' and not start:
            self.pbr()

        if tag in ("ol", "ul"):
            if start:
                self.list.append({'name': tag, 'num': 0})
            elif self.list:
                self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list:
                    li = self.list[-1]
                else:
                    # <li> outside any list is rendered as a bullet.
                    li = {'name': 'ul', 'num': 0}
                self.o(" " * len(self.list))  # TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul":
                    self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num']) + ". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ("table", "tr") and start:
            self.p()
        if tag == 'td':
            self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request a single line break before the next output, unless more is pending."""
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        """Request a paragraph break (two newlines) before the next output."""
        self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Write one fragment, preceded by any pending breaks, quoting and space.

        data     -- text to write
        puredata -- true for document text: outside <pre>, runs of whitespace
                    collapse to one space and a leading space becomes pending
        force    -- true to write even when data is empty; the value 'end'
                    additionally flushes link references and abbreviations
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if self.quiet:
            return

        if puredata and not self.pre:
            data = re.sub(r'\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force:
            return

        if self.startpre:
            # TODO: emit " :" here unless one is already present
            self.startpre = 0

        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "

        if self.pre:
            # Markdown code blocks are indented by four spaces.
            bq += "    "
            data = data.replace("\n", "\n" + bq)

        if self.start:
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        if self.p_p:
            self.out(('\n' + bq) * self.p_p)
            self.space = 0

        if self.space:
            if not self.lastWasNL:
                self.out(' ')
            self.space = 0

        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
            if force == "end":
                self.out("\n")

            newa = []
            for link in self.a:
                # Only links whose text has already been written are flushed.
                if self.outcount > link['outcount']:
                    self.out(" [" + str(link['count']) + "]: " + link['href'])  # TODO: base href
                    if 'title' in link:
                        self.out(" (" + link['title'] + ")")
                    self.out("\n")
                else:
                    newa.append(link)

            if self.a != newa:
                self.out("\n")  # Don't need an extra line when nothing was done.

            self.a = newa

        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out(" *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.lastWasNL = data and data[-1] == '\n'
        self.outcount += 1

    def handle_data(self, data):
        # NOTE(review): presumably compensates for an escaped closing tag
        # inside inline JavaScript that was counted as opened but never
        # closed -- confirm.  Clamped so ordinary text containing the
        # sequence cannot make quiet negative and silence the output.
        if r'\/script>' in data and self.quiet > 0:
            self.quiet -= 1
        self.o(data, 1)

    def unknown_decl(self, data):
        pass
|
414 |
|
def wrapwrite(text):
    """Write text to standard output, encoded as UTF-8."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
|
416 |
|
def html2text_file(html, out=wrapwrite):
    """Convert html to Markdown, passing the output to the callable out.

    Returns whatever the parser's close() returns: the accumulated text
    when out is None, otherwise an empty string.
    """
    parser = _html2text(out)
    # The document is followed by an empty feed, as in the original code.
    for chunk in (html, ""):
        parser.feed(chunk)
    return parser.close()
|
422 |
|
def html2text(html):
    """Return html converted to Markdown text, wrapped by optwrap()."""
    markdown = html2text_file(html, None)
    return optwrap(markdown)
|
425 |
|
# Command-line use:
#   html2text.py URL               fetch the URL and convert it
#   html2text.py FILE [ENCODING]   convert a local file (default encoding utf8)
#   html2text.py                   convert standard input (utf8)
if __name__ == "__main__":
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            j = urllib.urlopen(arg)
            try:
                text = j.read()
                headers = j.headers
            finally:
                # Release the connection even if the read fails.
                j.close()
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                # Without feedparser, assume UTF-8.
                enc = lambda x, y: ('utf-8', 1)
            encoding = enc(headers, text)[0]
            if encoding == 'us-ascii': encoding = 'utf-8'
            data = text.decode(encoding)

        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            f = open(arg, 'r')
            try:
                data = f.read().decode(encoding)
            finally:
                f.close()
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data))
|
448 |