src/cm/converters/oo_converters.py
branchpreserve_html
changeset 253 a844469257b0
parent 51 321f4057eb78
child 365 a478cb9786fd
equal deleted inserted replaced
252:0f0a79f7f213 253:a844469257b0
   221 THE_OUTDIR = "outdir"
   221 THE_OUTDIR = "outdir"
   222 THE_OUTFILE = "outfile"
   222 THE_OUTFILE = "outfile"
   223 
   223 
   224 THE_INDIR = "indir"
   224 THE_INDIR = "indir"
   225 THE_INFILE = "infile"
   225 THE_INFILE = "infile"
   226 
       
   227 def fix_img_path(html,xhtml,imgs):
       
   228     """
       
   229     imgs : name --> path
       
   230     """
       
   231     finder_re = 'src[\s]*=[\s]*\"((?:(?!https?))[^\"]*)\"'
       
   232     len_res_html = len(re.findall(finder_re,html,re.IGNORECASE))
       
   233     len_res_xhtml = len(re.findall(finder_re,xhtml,re.IGNORECASE))
       
   234     res_html = re.finditer(finder_re,html,re.IGNORECASE)
       
   235     res_xhtml = re.finditer(finder_re,xhtml,re.IGNORECASE)
       
   236     result = []
       
   237     last_index = 0
       
   238     for match_xhtml in res_xhtml:
       
   239         img_path = '' 
       
   240         try:
       
   241             match_html = res_html.next()
       
   242             if match_html:
       
   243                 img_name = match_html.group(1)
       
   244                 img_path = imgs[img_name]
       
   245         except StopIteration:
       
   246             # TODO : report pb
       
   247             pass 
       
   248         offset = len(match_xhtml.group(0)) - len(match_xhtml.group(1))
       
   249         result.append(xhtml[last_index:match_xhtml.start() + offset - 1])
       
   250         result.append(img_path)
       
   251         last_index = match_xhtml.end() - 1 # -1 because trailing "
       
   252     result.append(xhtml[last_index:len(xhtml)])
       
   253     return u''.join(result)
       
   254 
       
   255   
   226   
   256 def extract_css_body(xhtml):
   227 def extract_css_body(xhtml):
   257     dom = parseString(xhtml.encode('utf8'))
   228     dom = parseString(xhtml.encode('utf8'))
   258     style = dom.getElementsByTagName("style")[0].toxml()
   229     style = dom.getElementsByTagName("style")[0].toxml()
   259     body = dom.getElementsByTagName("body")[0].toxml()
   230     body = dom.getElementsByTagName("body")[0].toxml()