src/cm/ext/html2text.py
author Simon Descarpentries <sid@sopinspace.com>
Mon, 21 Oct 2013 16:37:07 +0200
changeset 553 bf26fb47a14c
parent 0 40c8f766c9b8
permissions -rwxr-xr-x
To allow scrolling in Safari mobile, we set the content of the text_view_comments frame in a jQuery UI layout. So the automated scrolling operations in c_sync.js must be adjustable to the right part to scroll. Also, if a comment has to be shown outside of the current viewport, we scroll the correct part to that viewport and then set the comment's top Y offset to just what it needs to avoid the "Add comment" button after the scrolling operation. If not in Safari mobile, we add an offset here to prevent the comment from being displayed under the "Add comment" button.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     1
#!/usr/bin/env python
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     2
"""html2text: Turn HTML into equivalent Markdown-structured text."""
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     3
__version__ = "2.35"
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     4
__author__ = "Aaron Swartz (me@aaronsw.com)"
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     5
__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     6
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     7
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     8
# TODO:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
     9
#   Support decoded entities with unifiable.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    10
#   Relative URL resolution
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    11
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    12
if not hasattr(__builtins__, 'True'): True, False = 1, 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    13
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    14
import sgmllib
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    15
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    16
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    17
try:
    from textwrap import wrap
except ImportError:
    # textwrap only exists from Python 2.3 on.  Bind a falsy placeholder so
    # that optwrap's `assert wrap, "Requires Python 2.3."` can actually fire
    # with its message instead of failing with a NameError.
    wrap = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    19
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    20
# Use Unicode characters instead of their ASCII pseudo-replacements
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    21
UNICODE_SNOB = 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    22
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    23
# Put the links after each paragraph instead of at the end.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    24
LINKS_EACH_PARAGRAPH = 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    25
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    26
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    27
BODY_WIDTH = 78
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    28
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    29
# Don't show internal links (href="#local-anchor") -- corresponding link targets
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    30
# won't be visible in the plain text file anyway.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    31
SKIP_INTERNAL_LINKS = False
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    32
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    33
### Entity Nonsense ###
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    34
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    35
def name2cp(k):
    """Return the code point (an int) for the HTML entity named k.

    'apos' is handled explicitly.  Every other name is looked up in
    htmlentitydefs; an unknown name raises KeyError.
    """
    if k == 'apos':
        return ord("'")

    if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]

    # Older Pythons only offer entitydefs, whose values are either a
    # "&#NNN;" reference or a single latin-1 encoded character.
    definition = htmlentitydefs.entitydefs[k]
    if definition.startswith("&#") and definition.endswith(";"):
        return int(definition[2:-1])  # not in latin-1
    return ord(codecs.latin_1_decode(definition)[0])
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    43
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    44
# Entity name -> plain ASCII stand-in, used by charref() and entityref()
# when UNICODE_SNOB is off.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# The same replacements keyed by code point, for numeric character references.
unifiable_n = {}

for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    57
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    58
def charref(name):
    """Return the text for a numeric character reference.

    name is the reference without its leading "&#": decimal digits, or
    'x'/'X' followed by hexadecimal digits.  Unless UNICODE_SNOB is set,
    code points listed in unifiable_n yield their ASCII stand-in; any other
    code point is returned as a one-character unicode string.

    Raises ValueError when name is not a valid number.
    """
    if name[0] in ['x','X']:
        c = int(name[1:], 16)
    else:
        c = int(name)

    # Direct dict membership: .keys() would build and scan a list on Python 2.
    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]
    return unichr(c)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    68
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    69
def entityref(c):
    """Return the text for the named entity c (name only, no '&' or ';').

    Unless UNICODE_SNOB is set, names listed in unifiable yield their ASCII
    stand-in.  Other known names yield the matching unicode character.
    Unknown names are returned as "&" + c.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]

    # Look the name up once; only the lookup itself is guarded.
    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    76
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    77
def replaceEntities(s):
    """Substitution callback for r_unescape: expand one matched reference.

    s is the regex match object; group(1) is the reference body without the
    surrounding '&' and ';'.  Bodies starting with '#' go to charref(), all
    others to entityref().
    """
    ref = s.group(1)
    if ref[0] == "#":
        try:
            return charref(ref[1:])
        except (ValueError, OverflowError):
            # r_unescape also matches malformed numeric references such as
            # "&#x;" or "&#zz;".  Leave those untouched instead of letting
            # int()/unichr() abort the whole conversion.
            return s.group(0)
    return entityref(ref)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    82
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    83
# "&...;" references: an optional '#', an optional x/X, then either hex
# digits or a 1-8 character word.  Group 1 is the text between '&' and ';'.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
    """Return s with every reference matched by r_unescape expanded."""
    return r_unescape.sub(replaceEntities, s)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    86
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    87
def fixattrs(attrs):
    """Return attrs with the entity references in each value expanded.

    attrs is a sequence of (name, value) pairs.  An empty or None attrs is
    returned unchanged; otherwise a new list of tuples is built.
    """
    # Fix bug in sgmllib.py
    if not attrs: return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    94
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    95
### End Entity Nonsense ###
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    96
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
    97
def onlywhite(line):
    """Return a true value if the line consists only of space characters.

    Returns False at the first character that is not a space.  Otherwise
    returns line itself, so an empty line gives a false value and a
    non-empty all-space line gives a true one.
    """
    for c in line:
        # Compare by value: `is` against a string literal only works by
        # accident of interning.  The original also compared against a
        # two-space literal, which a single character can never equal.
        # NOTE(review): a tab may have been intended there -- confirm before
        # treating tabs as whitespace.
        if c != ' ':
            return False
    return line
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   103
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   104
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns text unchanged when BODY_WIDTH is 0.  Otherwise each input line
    is treated as a paragraph:
      * lines not starting with ' ', '-' or '*' are wrapped at BODY_WIDTH
        and followed by one blank line;
      * lines starting with one of those characters are copied as they are,
        unless onlywhite() says they hold nothing but spaces, in which case
        they are dropped;
      * empty lines are copied, but never more than two newlines in a row.
    """
    if not BODY_WIDTH:
        return text

    assert wrap, "Requires Python 2.3."
    pieces = []      # joined once at the end instead of repeated +=
    newlines = 0     # consecutive newlines emitted so far (capped at 2)
    for para in text.split("\n"):
        if len(para) > 0:
            # Value comparison: the original used `is not` on string
            # literals, which depends on interpreter interning.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    pieces.append(line + "\n")
                pieces.append("\n")
                newlines = 2
            elif not onlywhite(para):
                pieces.append(para + "\n")
                newlines = 1
        elif newlines < 2:
            pieces.append("\n")
            newlines += 1
    return ''.join(pieces)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   128
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   129
def hn(tag):
    """Return the heading level (1-9) for tags 'h1'..'h9', otherwise 0.

    Any other tag, including the empty string, 'h0' and 'hr', gives 0, so
    the result can be used both as a truth value and as a repeat count.
    """
    # Check the length first so an empty tag cannot raise IndexError.
    if len(tag) == 2 and tag[0] == 'h':
        try:
            n = int(tag[1])
        except ValueError:
            return 0
        if 1 <= n <= 9:
            return n
    return 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   135
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   136
class _html2text(sgmllib.SGMLParser):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   137
    def __init__(self, out=sys.stdout.write):
        """Initialise the parser state.

        out -- callable that receives each piece of output text.  Pass None
               to collect the output in self.outtext (via outtextf) instead.
               The default is sys.stdout.write as bound when this module
               was imported.
        """
        sgmllib.SGMLParser.__init__(self)

        if out is None: self.out = self.outtextf
        else: self.out = out
        self.outtext = u''  # text collected by outtextf
        self.quiet = 0  # raised inside <head>/<style>/<script>, reset at <body>
        self.p_p = 0
        self.outcount = 0  # copied into each link's 'outcount' entry
        self.start = 1
        self.space = 0
        self.a = []  # attribute dicts of links/images that have a number
        self.astack = []  # attrs of the <a> tags currently open (or None)
        self.acount = 0  # last link number handed out
        self.list = []
        self.blockquote = 0  # number of <blockquote> elements currently open
        self.pre = 0  # consulted when handling <code>; presumably set inside <pre> -- TODO confirm
        self.startpre = 0
        self.lastWasNL = 0
        self.abbr_title = None # current abbreviation definition
        self.abbr_data = None # last inner HTML (for abbr being defined)
        self.abbr_list = {} # stack of abbreviations to write later
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   159
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   160
    def outtextf(self, s):
        """Output sink used when no `out` callable was given: append s to self.outtext."""
        self.outtext += s
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   162
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   163
    def close(self):
        """Finish parsing and return self.outtext.

        Closes the underlying SGMLParser, then calls self.pbr() and
        self.o('', 0, 'end') before returning.  Within this class only
        outtextf writes to self.outtext, so the return value presumably stays
        empty when a custom `out` callable is in use -- TODO confirm.
        """
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   170
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   171
    def handle_charref(self, c):
        """Output the text that charref() gives for the numeric reference c."""
        self.o(charref(c))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   173
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   174
    def handle_entityref(self, c):
        """Output the text that entityref() gives for the named entity c."""
        self.o(entityref(c))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   176
            
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   177
    def unknown_starttag(self, tag, attrs):
        """Forward a start tag and its attributes to handle_tag with start=1."""
        self.handle_tag(tag, attrs, 1)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   179
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   180
    def unknown_endtag(self, tag):
        """Forward an end tag to handle_tag with no attributes and start=0."""
        self.handle_tag(tag, None, 0)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   182
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   183
    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
            self.a list

            Two sets match when their 'href' values are equal and either
            both carry the same 'title' or neither carries one.

            If the set of attributes is not found, returns None
        """
        if 'href' not in attrs: return None

        i = -1
        for a in self.a:
            i += 1

            if 'href' not in a or a['href'] != attrs['href']:
                continue

            a_titled = 'title' in a
            new_titled = 'title' in attrs
            if a_titled != new_titled:
                # exactly one side has a title: not the same link
                continue
            if a_titled and a['title'] != attrs['title']:
                continue

            return i

        return None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   205
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   206
    def handle_tag(self, tag, attrs, start):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   207
        attrs = fixattrs(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   208
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   209
        if hn(tag):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   210
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   211
            if start: self.o(hn(tag)*"#" + ' ')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   212
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   213
        if tag in ['p', 'div']: self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   214
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   215
        if tag == "br" and start: self.o("  \n")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   216
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   217
        if tag == "hr" and start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   218
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   219
            self.o("* * *")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   220
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   221
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   222
        if tag in ["head", "style", 'script']: 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   223
            if start: self.quiet += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   224
            else: self.quiet -= 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   225
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   226
        if tag in ["body"]:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   227
            self.quiet = 0 # sites like 9rules.com never close <head>
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   228
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   229
        if tag == "blockquote":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   230
            if start: 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   231
                self.p(); self.o('> ', 0, 1); self.start = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   232
                self.blockquote += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   233
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   234
                self.blockquote -= 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   235
                self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   236
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   237
        if tag in ['em', 'i', 'u']: self.o("_")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   238
        if tag in ['strong', 'b']: self.o("**")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   239
        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   240
        if tag == "abbr":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   241
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   242
                attrsD = {}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   243
                for (x, y) in attrs: attrsD[x] = y
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   244
                attrs = attrsD
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   245
                
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   246
                self.abbr_title = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   247
                self.abbr_data = ''
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   248
                if attrs.has_key('title'):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   249
                    self.abbr_title = attrs['title']
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   250
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   251
                if self.abbr_title != None:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   252
                    self.abbr_list[self.abbr_data] = self.abbr_title
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   253
                    self.abbr_title = None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   254
                self.abbr_data = ''
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   255
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   256
        if tag == "a":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   257
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   258
                attrsD = {}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   259
                for (x, y) in attrs: attrsD[x] = y
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   260
                attrs = attrsD
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   261
                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): 
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   262
                    self.astack.append(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   263
                    self.o("[")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   264
                else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   265
                    self.astack.append(None)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   266
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   267
                if self.astack:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   268
                    a = self.astack.pop()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   269
                    if a:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   270
                        i = self.previousIndex(a)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   271
                        if i is not None:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   272
                            a = self.a[i]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   273
                        else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   274
                            self.acount += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   275
                            a['count'] = self.acount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   276
                            a['outcount'] = self.outcount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   277
                            self.a.append(a)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   278
                        self.o("][" + `a['count']` + "]")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   279
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   280
        if tag == "img" and start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   281
            attrsD = {}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   282
            for (x, y) in attrs: attrsD[x] = y
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   283
            attrs = attrsD
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   284
            if attrs.has_key('src'):
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   285
                attrs['href'] = attrs['src']
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   286
                alt = attrs.get('alt', '')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   287
                i = self.previousIndex(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   288
                if i is not None:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   289
                    attrs = self.a[i]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   290
                else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   291
                    self.acount += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   292
                    attrs['count'] = self.acount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   293
                    attrs['outcount'] = self.outcount
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   294
                    self.a.append(attrs)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   295
                self.o("![")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   296
                self.o(alt)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   297
                self.o("]["+`attrs['count']`+"]")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   298
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   299
        if tag == 'dl' and start: self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   300
        if tag == 'dt' and not start: self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   301
        if tag == 'dd' and start: self.o('    ')
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   302
        if tag == 'dd' and not start: self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   303
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   304
        if tag in ["ol", "ul"]:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   305
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   306
                self.list.append({'name':tag, 'num':0})
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   307
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   308
                if self.list: self.list.pop()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   309
            
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   310
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   311
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   312
        if tag == 'li':
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   313
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   314
                self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   315
                if self.list: li = self.list[-1]
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   316
                else: li = {'name':'ul', 'num':0}
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   317
                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   318
                if li['name'] == "ul": self.o("* ")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   319
                elif li['name'] == "ol":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   320
                    li['num'] += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   321
                    self.o(`li['num']`+". ")
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   322
                self.start = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   323
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   324
                self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   325
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   326
        if tag in ["table", "tr"] and start: self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   327
        if tag == 'td': self.pbr()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   328
        
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   329
        if tag == "pre":
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   330
            if start:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   331
                self.startpre = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   332
                self.pre = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   333
            else:
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   334
                self.pre = 0
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   335
            self.p()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   336
            
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   337
    def pbr(self):
        """Request a single line break before the next output.

        ``p_p`` is the number of newlines ``o`` writes ahead of the next
        piece of data.  A break that is already pending (any non-zero
        ``p_p``) is left untouched.
        """
        if self.p_p != 0:
            return
        self.p_p = 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   339
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   340
    def p(self):
        """Request a paragraph break (two newlines) before the next output."""
        self.p_p = 2
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   341
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   342
    def o(self, data, puredata=0, force=0):
        """Write *data* through ``self.out``, applying pending formatting.

        data     -- the text to emit.
        puredata -- when true and not inside a ``pre`` section, runs of
                    whitespace are collapsed to one space and a leading
                    space is turned into a pending-space flag.
        force    -- when true, the method runs even if *data* is empty.
                    The string ``'end'`` additionally flushes all queued
                    link references and the abbreviation list.

        Nothing is written while ``self.quiet`` is true, but *data* is
        still appended to ``self.abbr_data`` when that is being collected.
        """
        if self.abbr_data is not None: self.abbr_data += data

        if not self.quiet:
            if puredata and not self.pre:
                data = re.sub(r'\s+', ' ', data)
                if data and data[0] == ' ':
                    # Remember the leading space instead of emitting it now.
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            # Line prefix: one '>' per blockquote level, then a separating
            # space unless forced data already begins with '>'.
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                # Preformatted text is indented by four spaces on every line.
                bq += "    "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                # First output after a start marker: drop pending breaks.
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                # Emit the pending line/paragraph break, prefixed per line.
                self.out(('\n'+bq)*self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                # Write every link that was referenced before the current
                # output position; keep the rest queued for later.
                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out("   [" + repr(link['count']) + "]: " + link['href']) #TODO: base href
                        if 'title' in link: self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   408
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   409
    def handle_data(self, data):
        """Pass a run of character data to ``o`` as pure data.

        If the data contains the literal text ``\\/script>`` the ``quiet``
        counter is decremented first.
        NOTE(review): presumably this compensates for an escaped closing
        script tag inside inline JavaScript -- confirm against the tag
        handler that increments ``quiet``.
        """
        has_escaped_script_close = r'\/script>' in data
        if has_escaped_script_close:
            self.quiet = self.quiet - 1
        self.o(data, 1)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   412
    
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   413
    def unknown_decl(self, data):
        """Ignore the declaration passed in *data*; nothing is emitted."""
        return None
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   414
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   415
def wrapwrite(text):
    """Encode *text* as UTF-8 and write it to standard output."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   416
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   417
def html2text_file(html, out=wrapwrite):
    """Run *html* through a ``_html2text`` converter built with *out*.

    The document is fed to the converter, followed by an empty string,
    and the value returned by the converter's ``close()`` is returned.
    """
    converter = _html2text(out)
    for chunk in (html, ""):
        converter.feed(chunk)
    return converter.close()
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   422
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   423
def html2text(html):
    """Convert *html* and return the result after passing it to ``optwrap``.

    ``None`` is given as the output callable to ``html2text_file``, and
    that function's return value is what gets wrapped.
    """
    converted = html2text_file(html, None)
    return optwrap(converted)
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   425
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   426
# Command-line entry point.
#   html2text.py                      read HTML from stdin (decoded as UTF-8)
#   html2text.py URL                  fetch an http:// or https:// URL
#   html2text.py FILE [ENCODING]      read a local file (default: utf8)
# The converted text is written to stdout through wrapwrite().
if __name__ == "__main__":
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith(('http://', 'https://')):
            j = urllib.urlopen(arg)
            try:
                from feedparser import _getCharacterEncoding as enc
            except ImportError:
                # Without feedparser, assume the page is UTF-8.
                enc = lambda x, y: ('utf-8', 1)
            # try/finally rather than `with`, so the handle is always
            # released without relying on context-manager support.
            try:
                text = j.read()
            finally:
                j.close()
            encoding = enc(j.headers, text)[0]
            # us-ascii is a subset of UTF-8; decoding as UTF-8 is more lenient.
            if encoding == 'us-ascii': encoding = 'utf-8'
            data = text.decode(encoding)

        else:
            encoding = 'utf8'
            if len(sys.argv) > 2:
                encoding = sys.argv[2]
            f = open(arg, 'r')
            try:
                data = f.read().decode(encoding)
            finally:
                f.close()
    else:
        data = sys.stdin.read().decode('utf8')
    wrapwrite(html2text(data))
40c8f766c9b8 import from internal svn r 4007
raph
parents:
diff changeset
   448