src/cm/ext/html2text.py
author Production Moz <dev@sopinspace.com>
Tue, 31 May 2011 17:52:28 +0200
changeset 344 9787360440db
parent 0 40c8f766c9b8
permissions -rwxr-xr-x
memoize get_activity
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
#!/usr/bin/env python
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
"""html2text: Turn HTML into equivalent Markdown-structured text."""
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     3
__version__ = "2.35"
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     4
__author__ = "Aaron Swartz (me@aaronsw.com)"
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     5
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     6
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     7
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     8
# TODO:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
#   Support decoded entities with unifiable.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    10
#   Relative URL resolution
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    11
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    12
if not hasattr(__builtins__, 'True'): True, False = 1, 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    13
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    14
import sgmllib
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    15
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    16
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    17
# textwrap is optional here. When it cannot be imported, bind wrap to None so
# that optwrap()'s "assert wrap" check fails with its intended message rather
# than with a NameError on an undefined name.
try:
    from textwrap import wrap
except ImportError:
    wrap = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    19
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    20
# Use Unicode characters instead of their ASCII pseudo-replacements
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    21
UNICODE_SNOB = 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    22
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    23
# Put the links after each paragraph instead of at the end.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    24
LINKS_EACH_PARAGRAPH = 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    25
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    26
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    27
BODY_WIDTH = 78
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    28
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    29
# Don't show internal links (href="#local-anchor") -- corresponding link targets
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    30
# won't be visible in the plain text file anyway.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    31
SKIP_INTERNAL_LINKS = False
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    32
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    33
### Entity Nonsense ###
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    34
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    35
def name2cp(k):
    """Map the HTML entity name k to its Unicode code point (an int).

    'apos' is answered directly. Every other name is looked up in the
    htmlentitydefs tables, so an unknown name raises KeyError.
    """
    if k == 'apos':
        return ord("'")

    if not hasattr(htmlentitydefs, "name2codepoint"):
        # Fallback when name2codepoint is unavailable: entitydefs maps a name
        # either to a "&#NNN;" reference or to a single latin-1 character.
        definition = htmlentitydefs.entitydefs[k]
        if definition.startswith("&#") and definition.endswith(";"):
            return int(definition[2:-1])  # not in latin-1
        return ord(codecs.latin_1_decode(definition)[0])

    return htmlentitydefs.name2codepoint[k]  # requires Python 2.3
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    43
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    44
# Entity name -> plain ASCII stand-in, used when UNICODE_SNOB is off.
unifiable = {
    # quotes
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    # symbols and punctuation
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ', 'rarr': '->', 'larr': '<-',
    'middot': '*', 'ndash': '-',
    # ligatures
    'oelig': 'oe', 'aelig': 'ae',
    # accented vowels
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a', 'atilde': 'a', 'auml': 'a',
    'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o', 'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    52
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    53
# The same replacements as `unifiable`, keyed by code point (via name2cp)
# instead of by entity name.
unifiable_n = dict([(name2cp(k), unifiable[k]) for k in unifiable])
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    57
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    58
def charref(name):
    """Return the text for a numeric character reference.

    name is the reference with its leading "&#" already removed: decimal
    digits, or hex digits prefixed with 'x' or 'X'.

    When UNICODE_SNOB is off and the code point has an entry in unifiable_n,
    that replacement string is returned; otherwise the result is unichr() of
    the code point. int() raises ValueError if the digits are malformed.
    """
    if name[0] in ('x', 'X'):
        c = int(name[1:], 16)
    else:
        c = int(name)

    # Test membership on the dict itself; .keys() would build a list first.
    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]
    return unichr(c)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    68
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    69
def entityref(c):
    """Return the text for the named entity reference c (e.g. 'amp').

    When UNICODE_SNOB is off and c has an entry in unifiable, that
    replacement string is returned. Otherwise the name is resolved with
    name2cp() and returned as a single unicode character. A name that
    name2cp() does not know is returned as "&" + c.
    """
    # Test membership on the dict itself; .keys() would build a list first.
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]

    # Resolve the name once and reuse the result.
    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    76
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    77
def replaceEntities(s):
    """Substitution callback: turn one matched reference into its text.

    s is a match object whose group(1) holds the reference body. A body
    starting with "#" goes to charref() without the "#"; anything else goes
    to entityref().
    """
    body = s.group(1)
    if body[0] != "#":
        return entityref(body)
    return charref(body[1:])
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    82
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    83
# Matches an "&...;" reference and captures its body (between "&" and ";").
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Return s with each reference matched by r_unescape replaced by the
    result of replaceEntities()."""
    return re.sub(r_unescape, replaceEntities, s)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    86
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    87
def fixattrs(attrs):
    """Return the (name, value) attribute pairs with each value unescaped.

    Works around sgmllib.py handing over attribute values with their entity
    references still in place. A false attrs (None or empty) is returned
    unchanged.
    """
    # Fix bug in sgmllib.py
    if not attrs:
        return attrs
    return [(pair[0], unescape(pair[1])) for pair in attrs]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    94
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    95
### End Entity Nonsense ###
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    96
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    97
def onlywhite(line):
    """Return a true value if the line consists only of space characters.

    Returns False as soon as a non-space character is found. Otherwise the
    line itself is returned, so a non-empty all-space line is true and an
    empty line yields a false value ('').
    """
    for c in line:
        # Compare by value. The old `is` checks tested object identity, which
        # only works where the interpreter happens to share one-character
        # strings. The old second comparison used a two-character literal,
        # which a single character can never equal, so it is dropped.
        if c != ' ':
            return False
    return line
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   103
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   104
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    With BODY_WIDTH set to 0 the text is returned untouched. Otherwise each
    line of text is handled as one paragraph:

    * a line starting with ' ', '-' or '*' is emitted unwrapped, unless
      onlywhite() says it is blank, in which case it is dropped;
    * any other non-empty line is wrapped to BODY_WIDTH columns and followed
      by one blank line;
    * empty lines are passed through, collapsed so that no more than two
      newlines appear in a row.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []  # joined once at the end instead of repeated string +=
    newlines = 0  # newlines emitted since the last visible text
    for para in text.split("\n"):
        if len(para) > 0:
            # Compare by value; `is not` tested object identity.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    pieces.append(line + "\n")
                pieces.append("\n")
                newlines = 2
            elif not onlywhite(para):
                pieces.append(para + "\n")
                newlines = 1
        elif newlines < 2:
            pieces.append("\n")
            newlines += 1
    return ''.join(pieces)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   128
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   129
def hn(tag):
    """Return the heading level for a tag name, or 0 if it is not a heading.

    'h1' .. 'h9' give 1 .. 9. Every other tag, including the empty string
    and 'h0', gives 0, so the result is always an int that can be used both
    as a truth value and as a repeat count.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        n = int(tag[1])
    except ValueError:
        return 0
    if 1 <= n <= 9:
        return n
    return 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   135
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   136
class _html2text(sgmllib.SGMLParser):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   137
    def __init__(self, out=sys.stdout.write):
        """Set up a fresh converter.

        out is the callable that receives generated text. Passing None
        selects self.outtextf, which accumulates the text in self.outtext.
        """
        sgmllib.SGMLParser.__init__(self)

        # Output sink and buffer.
        self.out = out
        if out is None:
            self.out = self.outtextf
        self.outtext = u''
        self.outcount = 0

        # Formatting state.
        self.quiet = 0       # raised while inside head/style/script
        self.p_p = 0
        self.start = 1
        self.space = 0
        self.lastWasNL = 0
        self.list = []
        self.blockquote = 0  # raised on <blockquote>, lowered on </blockquote>
        self.pre = 0
        self.startpre = 0

        # Link bookkeeping.
        self.a = []          # attribute dicts of links already numbered
        self.astack = []     # attrs (or None) for each currently open <a>
        self.acount = 0      # last number handed out to a link

        # Abbreviation bookkeeping.
        self.abbr_title = None  # current abbreviation definition
        self.abbr_data = None   # last inner HTML (for abbr being defined)
        self.abbr_list = {}     # stack of abbreviations to write later
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   159
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   160
    def outtextf(self, s):
        """Append s to the text accumulated in self.outtext."""
        self.outtext = self.outtext + s
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   162
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   163
    def close(self):
        """Finish parsing and return the text accumulated in self.outtext.

        Closes the underlying SGMLParser first, then calls self.pbr() and
        self.o('', 0, 'end'). Both are defined outside this view; presumably
        they flush pending breaks and trailing output (confirm against o()).
        """
        sgmllib.SGMLParser.close(self)
        self.pbr()
        self.o('', 0, 'end')
        return self.outtext
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   170
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   171
    def handle_charref(self, c):
        """Emit the text for the numeric character reference c via self.o()."""
        text = charref(c)
        self.o(text)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   173
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   174
    def handle_entityref(self, c):
        """Emit the text for the named entity reference c via self.o()."""
        text = entityref(c)
        self.o(text)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   176
            
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   177
    def unknown_starttag(self, tag, attrs):
        """Forward an opening tag to handle_tag() with the start flag set."""
        is_start = 1
        self.handle_tag(tag, attrs, is_start)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   179
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   180
    def unknown_endtag(self, tag):
        """Forward a closing tag to handle_tag() with no attrs and the start
        flag cleared."""
        is_start = 0
        self.handle_tag(tag, None, is_start)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   182
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   183
    def previousIndex(self, attrs):
        """Return the index in self.a of a link equivalent to attrs.

        attrs is a dict of link attributes. An entry of self.a matches when
        its 'href' equals attrs['href'] and either neither dict has a
        'title', or both have one and the titles are equal.

        Returns the index of the first match, or None when attrs has no
        'href' or nothing in self.a matches.
        """
        if 'href' not in attrs:
            return None

        for i, a in enumerate(self.a):
            if 'href' not in a or a['href'] != attrs['href']:
                continue

            if 'title' in a or 'title' in attrs:
                # A title on either side must be present on both and agree.
                if ('title' in a and 'title' in attrs
                        and a['title'] == attrs['title']):
                    return i
            else:
                return i

        return None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   205
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   206
    def handle_tag(self, tag, attrs, start):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   207
        attrs = fixattrs(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   208
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   209
        if hn(tag):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   210
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   211
            if start: self.o(hn(tag)*"#" + ' ')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   212
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   213
        if tag in ['p', 'div']: self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   214
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   215
        if tag == "br" and start: self.o("  \n")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   216
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   217
        if tag == "hr" and start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   218
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   219
            self.o("* * *")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   220
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   221
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   222
        if tag in ["head", "style", 'script']: 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   223
            if start: self.quiet += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   224
            else: self.quiet -= 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   225
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   226
        if tag in ["body"]:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   227
            self.quiet = 0 # sites like 9rules.com never close <head>
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   228
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   229
        if tag == "blockquote":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   230
            if start: 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   231
                self.p(); self.o('> ', 0, 1); self.start = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   232
                self.blockquote += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   233
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   234
                self.blockquote -= 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   235
                self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   236
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   237
        if tag in ['em', 'i', 'u']: self.o("_")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   238
        if tag in ['strong', 'b']: self.o("**")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   239
        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   240
        if tag == "abbr":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   241
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   242
                attrsD = {}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   243
                for (x, y) in attrs: attrsD[x] = y
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   244
                attrs = attrsD
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   245
                
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   246
                self.abbr_title = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   247
                self.abbr_data = ''
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   248
                if attrs.has_key('title'):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   249
                    self.abbr_title = attrs['title']
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   250
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   251
                if self.abbr_title != None:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   252
                    self.abbr_list[self.abbr_data] = self.abbr_title
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   253
                    self.abbr_title = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   254
                self.abbr_data = ''
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   255
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   256
        if tag == "a":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   257
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   258
                attrsD = {}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   259
                for (x, y) in attrs: attrsD[x] = y
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   260
                attrs = attrsD
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   261
                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   262
                    self.astack.append(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   263
                    self.o("[")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   264
                else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   265
                    self.astack.append(None)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   266
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   267
                if self.astack:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   268
                    a = self.astack.pop()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   269
                    if a:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   270
                        i = self.previousIndex(a)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   271
                        if i is not None:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   272
                            a = self.a[i]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   273
                        else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   274
                            self.acount += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   275
                            a['count'] = self.acount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   276
                            a['outcount'] = self.outcount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   277
                            self.a.append(a)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   278
                        self.o("][" + `a['count']` + "]")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   279
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   280
        if tag == "img" and start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   281
            attrsD = {}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   282
            for (x, y) in attrs: attrsD[x] = y
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   283
            attrs = attrsD
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   284
            if attrs.has_key('src'):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   285
                attrs['href'] = attrs['src']
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   286
                alt = attrs.get('alt', '')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   287
                i = self.previousIndex(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   288
                if i is not None:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   289
                    attrs = self.a[i]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   290
                else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   291
                    self.acount += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   292
                    attrs['count'] = self.acount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   293
                    attrs['outcount'] = self.outcount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   294
                    self.a.append(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   295
                self.o("![")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   296
                self.o(alt)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   297
                self.o("]["+`attrs['count']`+"]")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   298
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   299
        if tag == 'dl' and start: self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   300
        if tag == 'dt' and not start: self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   301
        if tag == 'dd' and start: self.o('    ')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   302
        if tag == 'dd' and not start: self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   303
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   304
        if tag in ["ol", "ul"]:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   305
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   306
                self.list.append({'name':tag, 'num':0})
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   307
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   308
                if self.list: self.list.pop()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   309
            
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   310
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   311
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   312
        if tag == 'li':
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   313
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   314
                self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   315
                if self.list: li = self.list[-1]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   316
                else: li = {'name':'ul', 'num':0}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   317
                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   318
                if li['name'] == "ul": self.o("* ")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   319
                elif li['name'] == "ol":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   320
                    li['num'] += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   321
                    self.o(`li['num']`+". ")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   322
                self.start = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   323
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   324
                self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   325
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   326
        if tag in ["table", "tr"] and start: self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   327
        if tag == 'td': self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   328
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   329
        if tag == "pre":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   330
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   331
                self.startpre = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   332
                self.pre = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   333
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   334
                self.pre = 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   335
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   336
            
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   337
    def pbr(self):
        """Request a single line break before the next output.

        Only raises ``p_p`` from 0 to 1, so a pending value that is already
        non-zero (for example the 2 set by ``p()``) is left untouched.
        """
        if self.p_p == 0:
            self.p_p = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   339
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   340
    def p(self):
        """Request a paragraph break by setting the pending count ``p_p`` to 2."""
        self.p_p = 2
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   341
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   342
    def o(self, data, puredata=0, force=0):
        """Write ``data`` through ``self.out``, emitting pending breaks first.

        data     -- text to write; it is also appended to ``self.abbr_data``
                    whenever that attribute is not None.
        puredata -- true for text forwarded by ``handle_data``; outside a
                    <pre> block its whitespace runs are collapsed to one
                    space.
        force    -- true to run the output path even when ``data`` is empty.
                    The special value 'end' additionally flushes the
                    collected link references and abbreviation definitions.

        Nothing is written while ``self.quiet`` is true.
        """
        if self.abbr_data is not None: self.abbr_data += data

        if self.quiet:
            return

        if puredata and not self.pre:
            # Collapse whitespace runs.  A leading blank is only remembered
            # in self.space; it is written later unless a line break
            # intervenes.
            data = re.sub(r'\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force: return

        if self.startpre:
            #self.out(" :") #TODO: not output when already one there
            self.startpre = 0

        # Prefix repeated after every newline written inside a blockquote.
        bq = (">" * self.blockquote)
        if not (force and data and data[0] == ">") and self.blockquote: bq += " "

        if self.pre:
            # Preformatted text is indented four columns on every line.
            bq += "    "
            data = data.replace("\n", "\n"+bq)

        if self.start:
            # Cancel any pending space or break when self.start is set.
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        if self.p_p:
            self.out(('\n'+bq)*self.p_p)
            self.space = 0

        if self.space:
            if not self.lastWasNL: self.out(' ')
            self.space = 0

        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
            if force == "end": self.out("\n")

            # Emit every link already referenced in the output; keep the
            # others for a later flush.
            newa = []
            for link in self.a:
                if self.outcount > link['outcount']:
                    self.out("   ["+repr(link['count'])+"]: " + link['href']) #TODO: base href
                    if 'title' in link: self.out(" ("+link['title']+")")
                    self.out("\n")
                else:
                    newa.append(link)

            if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

            self.a = newa

        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out("  *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.lastWasNL = data and data[-1] == '\n'
        self.outcount += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   408
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   409
    def handle_data(self, data):
        """Pass character data on to ``o()`` flagged as pure data."""
        # A chunk holding the literal text \/script> lowers the quiet counter
        # by one before the data is written.
        if r'\/script>' in data:
            self.quiet -= 1
        self.o(data, 1)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   412
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   413
    def unknown_decl(self, data):
        """Accept ``data`` and deliberately do nothing with it."""
        return None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   414
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   415
def wrapwrite(text):
    """Write ``text`` to standard output encoded as UTF-8."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   416
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   417
def html2text_file(html, out=wrapwrite):
    """Run ``html`` through a ``_html2text`` parser constructed with ``out``.

    html -- the markup to convert.
    out  -- passed straight to ``_html2text``; defaults to ``wrapwrite``.

    Returns whatever the parser's ``close()`` returns.
    """
    h = _html2text(out)
    h.feed(html)
    # NOTE(review): the extra empty feed presumably pushes any buffered
    # input through the parser before close() -- confirm.
    h.feed("")
    return h.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   422
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   423
def html2text(html):
    """Convert ``html`` and return the result passed through ``optwrap``.

    ``html2text_file`` is called with ``out=None`` and its return value is
    handed to ``optwrap``.
    """
    return optwrap(html2text_file(html, None))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   425
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   426
if __name__ == "__main__":
    # Usage: html2text.py [URL | FILE [ENCODING]]; with no argument the
    # document is read from standard input as UTF-8.
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            j = urllib.urlopen(arg)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                # Without feedparser, assume the page is UTF-8.
                enc = lambda x, y: ('utf-8', 1)
            try:
                text = j.read()
                encoding = enc(j.headers, text)[0]
            finally:
                # Release the connection even if reading or sniffing fails.
                j.close()
            if encoding == 'us-ascii': encoding = 'utf-8'
            data = text.decode(encoding)

        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            f = open(arg, 'r')
            try:
                data = f.read().decode(encoding)
            finally:
                # Do not leak the file handle.
                f.close()
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   448