# String constants naming the output locations.
# NOTE(review): presumably used as option/dict keys elsewhere in the file -- confirm.
THE_OUTDIR = "outdir"
THE_OUTFILE = "outfile"

# String constants naming the input locations (same caveat as above).
THE_INDIR = "indir"
THE_INFILE = "infile"
226 |
|
# Matches a quoted ``src`` attribute whose value does not start with
# ``http``/``https``; group 1 is the attribute value without the quotes.
_LOCAL_SRC_RE = re.compile(r'src\s*=\s*"((?!https?)[^"]*)"', re.IGNORECASE)


def fix_img_path(html, xhtml, imgs):
    """Rewrite the non-http ``src`` attribute values of *xhtml*.

    The ``src`` attributes of *html* and *xhtml* are paired by position:
    the n-th matching ``src`` value found in *html* is looked up in *imgs*
    and the resulting path replaces the n-th matching ``src`` value in
    *xhtml*.  Everything else in *xhtml* is copied through unchanged.

    html  : text whose ``src`` values are used as lookup names
    xhtml : text whose ``src`` values are replaced
    imgs  : mapping name --> path

    Returns the rewritten *xhtml* text.

    Raises ``KeyError`` when a name found in *html* is not in *imgs*.
    If *html* has fewer matching ``src`` attributes than *xhtml*, the
    surplus *xhtml* attributes get an empty value.
    """
    html_matches = _LOCAL_SRC_RE.finditer(html)
    pieces = []
    cursor = 0
    for xhtml_match in _LOCAL_SRC_RE.finditer(xhtml):
        try:
            # Built-in next() works on both Python 2.6+ and Python 3,
            # unlike the Python 2-only ``.next()`` iterator method.
            html_match = next(html_matches)
        except StopIteration:
            # TODO : report pb (html has fewer src attributes than xhtml)
            img_path = ''
        else:
            img_path = imgs[html_match.group(1)]
        # Copy everything up to the start of the attribute value, then the
        # new path; the closing quote is picked up by the next slice.
        pieces.append(xhtml[cursor:xhtml_match.start(1)])
        pieces.append(img_path)
        cursor = xhtml_match.end(1)
    pieces.append(xhtml[cursor:])
    return u''.join(pieces)
|
254 |
|
255 |
226 |
256 def extract_css_body(xhtml): |
227 def extract_css_body(xhtml): |
257 dom = parseString(xhtml.encode('utf8')) |
228 dom = parseString(xhtml.encode('utf8')) |
258 style = dom.getElementsByTagName("style")[0].toxml() |
229 style = dom.getElementsByTagName("style")[0].toxml() |
259 body = dom.getElementsByTagName("body")[0].toxml() |
230 body = dom.getElementsByTagName("body")[0].toxml() |