|
#!/usr/bin/python2.4

"""Diff Match and Patch

Copyright 2006 Google Inc.
http://code.google.com/p/google-diff-match-patch/

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Functions for diff, match and patch.

Computes the difference between two texts to create a patch.
Applies the patch onto another text, allowing for errors.
"""
|
26 |
|
27 __author__ = 'fraser@google.com (Neil Fraser)' |
|
28 |
|
29 import math |
|
30 import time |
|
31 import urllib |
|
32 import re |
|
33 |
|
34 class diff_match_patch: |
|
35 """Class containing the diff, match and patch methods. |
|
36 |
|
37 Also contains the behaviour settings. |
|
38 """ |
|
39 |
|
def __init__(self):
  """Inits a diff_match_patch object with default settings.
  Redefine these in your program to override the defaults.

  All attributes are plain numbers; no I/O or allocation happens here.
  """

  # Number of seconds to map a diff before giving up (0 for infinity).
  self.Diff_Timeout = 1.0
  # Cost of an empty edit operation in terms of edit characters.
  self.Diff_EditCost = 4
  # The size beyond which the double-ended diff activates.
  # Double-ending is twice as fast, but less accurate.
  self.Diff_DualThreshold = 32
  # At what point is no match declared (0.0 = perfection, 1.0 = very loose).
  self.Match_Threshold = 0.5
  # How far to search for a match (0 = exact location, 1000+ = broad match).
  # A match this many characters away from the expected location will add
  # 1.0 to the score (0.0 is a perfect match).
  self.Match_Distance = 1000
  # When deleting a large block of text (over ~64 characters), how close does
  # the contents have to match the expected contents. (0.0 = perfection,
  # 1.0 = very loose).  Note that Match_Threshold controls how closely the
  # end points of a delete need to match.
  self.Patch_DeleteThreshold = 0.5
  # Chunk size for context length.
  self.Patch_Margin = 4

  # How many bits in a number?
  # Python has no maximum, thus to disable patch splitting set to 0.
  # However to avoid long patches in certain pathological cases, use 32.
  # Multiple short patches (using native ints) are much faster than long ones.
  self.Match_MaxBits = 32
|
71 |
|
#  DIFF FUNCTIONS

# The data structure representing a diff is an array of tuples:
# [(DIFF_DELETE, "Hello"), (DIFF_INSERT, "Goodbye"), (DIFF_EQUAL, " world.")]
# which means: delete "Hello", add "Goodbye" and keep " world."
# These are class-level constants shared by all instances and by callers
# inspecting returned diff tuples.
DIFF_DELETE = -1
DIFF_INSERT = 1
DIFF_EQUAL = 0
|
80 |
|
81 def diff_main(self, text1, text2, checklines=True): |
|
82 """Find the differences between two texts. Simplifies the problem by |
|
83 stripping any common prefix or suffix off the texts before diffing. |
|
84 |
|
85 Args: |
|
86 text1: Old string to be diffed. |
|
87 text2: New string to be diffed. |
|
88 checklines: Optional speedup flag. If present and false, then don't run |
|
89 a line-level diff first to identify the changed areas. |
|
90 Defaults to true, which does a faster, slightly less optimal diff. |
|
91 |
|
92 Returns: |
|
93 Array of changes. |
|
94 """ |
|
95 |
|
96 # Check for equality (speedup) |
|
97 if text1 == text2: |
|
98 return [(self.DIFF_EQUAL, text1)] |
|
99 |
|
100 # Trim off common prefix (speedup) |
|
101 commonlength = self.diff_commonPrefix(text1, text2) |
|
102 commonprefix = text1[:commonlength] |
|
103 text1 = text1[commonlength:] |
|
104 text2 = text2[commonlength:] |
|
105 |
|
106 # Trim off common suffix (speedup) |
|
107 commonlength = self.diff_commonSuffix(text1, text2) |
|
108 if commonlength == 0: |
|
109 commonsuffix = '' |
|
110 else: |
|
111 commonsuffix = text1[-commonlength:] |
|
112 text1 = text1[:-commonlength] |
|
113 text2 = text2[:-commonlength] |
|
114 |
|
115 # Compute the diff on the middle block |
|
116 diffs = self.diff_compute(text1, text2, checklines) |
|
117 |
|
118 # Restore the prefix and suffix |
|
119 if commonprefix: |
|
120 diffs[:0] = [(self.DIFF_EQUAL, commonprefix)] |
|
121 if commonsuffix: |
|
122 diffs.append((self.DIFF_EQUAL, commonsuffix)) |
|
123 self.diff_cleanupMerge(diffs) |
|
124 return diffs |
|
125 |
|
def diff_compute(self, text1, text2, checklines):
  """Find the differences between two texts.  Assumes that the texts do not
  have any common prefix or suffix.

  Tries a series of cheap shortcuts (empty input, containment, half-match)
  before falling back to the full diff_map exploration.

  Args:
    text1: Old string to be diffed.
    text2: New string to be diffed.
    checklines: Speedup flag.  If false, then don't run a line-level diff
      first to identify the changed areas.
      If true, then run a faster, slightly less optimal diff.

  Returns:
    Array of changes.
  """
  if not text1:
    # Just add some text (speedup)
    return [(self.DIFF_INSERT, text2)]

  if not text2:
    # Just delete some text (speedup)
    return [(self.DIFF_DELETE, text1)]

  if len(text1) > len(text2):
    (longtext, shorttext) = (text1, text2)
  else:
    (shorttext, longtext) = (text1, text2)
  i = longtext.find(shorttext)
  if i != -1:
    # Shorter text is inside the longer text (speedup)
    diffs = [(self.DIFF_INSERT, longtext[:i]), (self.DIFF_EQUAL, shorttext),
             (self.DIFF_INSERT, longtext[i + len(shorttext):])]
    # Swap insertions for deletions if diff is reversed.
    if len(text1) > len(text2):
      diffs[0] = (self.DIFF_DELETE, diffs[0][1])
      diffs[2] = (self.DIFF_DELETE, diffs[2][1])
    return diffs
  longtext = shorttext = None  # Garbage collect.

  # Check to see if the problem can be split in two.
  hm = self.diff_halfMatch(text1, text2)
  if hm:
    # A half-match was found, sort out the return data.
    (text1_a, text1_b, text2_a, text2_b, mid_common) = hm
    # Send both pairs off for separate processing.
    diffs_a = self.diff_main(text1_a, text2_a, checklines)
    diffs_b = self.diff_main(text1_b, text2_b, checklines)
    # Merge the results.
    return diffs_a + [(self.DIFF_EQUAL, mid_common)] + diffs_b

  # Perform a real diff.
  if checklines and (len(text1) < 100 or len(text2) < 100):
    checklines = False  # Too trivial for the overhead.
  if checklines:
    # Scan the text on a line-by-line basis first.
    (text1, text2, linearray) = self.diff_linesToChars(text1, text2)

  diffs = self.diff_map(text1, text2)
  if not diffs:  # No acceptable result.
    diffs = [(self.DIFF_DELETE, text1), (self.DIFF_INSERT, text2)]
  if checklines:
    # Convert the diff back to original text.
    self.diff_charsToLines(diffs, linearray)
    # Eliminate freak matches (e.g. blank lines)
    self.diff_cleanupSemantic(diffs)

    # Rediff any replacement blocks, this time character-by-character.
    # Add a dummy entry at the end so the final run is flushed.
    diffs.append((self.DIFF_EQUAL, ''))
    pointer = 0
    count_delete = 0
    count_insert = 0
    text_delete = ''
    text_insert = ''
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_INSERT:
        count_insert += 1
        text_insert += diffs[pointer][1]
      elif diffs[pointer][0] == self.DIFF_DELETE:
        count_delete += 1
        text_delete += diffs[pointer][1]
      elif diffs[pointer][0] == self.DIFF_EQUAL:
        # Upon reaching an equality, check for prior redundancies.
        if count_delete >= 1 and count_insert >= 1:
          # Delete the offending records and add the merged ones.
          # The splice changes len(diffs), so the pointer is rebased to
          # just past the inserted sub-diff.
          a = self.diff_main(text_delete, text_insert, False)
          diffs[pointer - count_delete - count_insert : pointer] = a
          pointer = pointer - count_delete - count_insert + len(a)
        count_insert = 0
        count_delete = 0
        text_delete = ''
        text_insert = ''

      pointer += 1

    diffs.pop()  # Remove the dummy entry at the end.
  return diffs
|
222 |
|
def diff_linesToChars(self, text1, text2):
  """Split two texts into an array of strings.  Reduce the texts to a string
  of hashes where each Unicode character represents one line.

  Args:
    text1: First string.
    text2: Second string.

  Returns:
    Three element tuple, containing the encoded text1, the encoded text2 and
    the array of unique strings.  The zeroth element of the array of unique
    strings is intentionally blank.
  """
  lineArray = []  # e.g. lineArray[4] == "Hello\n"
  lineHash = {}  # e.g. lineHash["Hello\n"] == 4

  # "\x00" is a valid character, but various debuggers don't like it.
  # So we'll insert a junk entry to avoid generating a null character.
  lineArray.append('')

  def diff_linesToCharsMunge(text):
    """Split a text into an array of strings.  Reduce the texts to a string
    of hashes where each Unicode character represents one line.
    Modifies linearray and linehash through being a closure.

    Args:
      text: String to encode.

    Returns:
      Encoded string.
    """
    chars = []
    # Walk the text, pulling out a substring for each line.
    # text.split('\n') would temporarily double our memory footprint.
    # Modifying text would create many large strings to garbage collect.
    lineStart = 0
    lineEnd = -1
    while lineEnd < len(text) - 1:
      lineEnd = text.find('\n', lineStart)
      if lineEnd == -1:
        # Last line may lack a trailing newline.
        lineEnd = len(text) - 1
      line = text[lineStart:lineEnd + 1]
      lineStart = lineEnd + 1

      if line in lineHash:
        # Seen before: reuse the existing code point for this line.
        chars.append(unichr(lineHash[line]))
      else:
        # New line: register it and emit the next free code point.
        lineArray.append(line)
        lineHash[line] = len(lineArray) - 1
        chars.append(unichr(len(lineArray) - 1))
    return "".join(chars)

  chars1 = diff_linesToCharsMunge(text1)
  chars2 = diff_linesToCharsMunge(text2)
  return (chars1, chars2, lineArray)
|
278 |
|
def diff_charsToLines(self, diffs, lineArray):
  """Rehydrate the text in a diff from a string of line hashes to real lines
  of text.  Mutates diffs in place; returns None.

  Args:
    diffs: Array of diff tuples.
    lineArray: Array of unique strings.
  """
  for index, (op, encoded) in enumerate(diffs):
    # Each character's code point is an index into lineArray.
    lines = [lineArray[ord(ch)] for ch in encoded]
    diffs[index] = (op, "".join(lines))
|
292 |
|
def diff_map(self, text1, text2):
  """Explore the intersection points between the two texts.

  Walks edit paths of increasing cost d; when Diff_DualThreshold allows,
  a reverse path is walked simultaneously and the diff is split where the
  two paths meet.

  Args:
    text1: Old string to be diffed.
    text2: New string to be diffed.

  Returns:
    Array of diff tuples or None if no diff available.
  """

  # Unlike in most languages, Python counts time in seconds.
  s_end = time.time() + self.Diff_Timeout  # Don't run for too long.
  # Cache the text lengths to prevent multiple calls.
  text1_length = len(text1)
  text2_length = len(text2)
  max_d = text1_length + text2_length - 1
  doubleEnd = self.Diff_DualThreshold * 2 < max_d
  # v_map1/v_map2 record, per cost d, which (x, y) points were reached;
  # diff_path1/diff_path2 backtrack through them afterwards.
  v_map1 = []
  v_map2 = []
  v1 = {}
  v2 = {}
  v1[1] = 0
  v2[1] = 0
  footsteps = {}
  done = False
  # If the total number of characters is odd, then the front path will
  # collide with the reverse path.
  front = (text1_length + text2_length) % 2
  for d in xrange(max_d):
    # Bail out if timeout reached.
    if self.Diff_Timeout > 0 and time.time() > s_end:
      return None

    # Walk the front path one step.
    v_map1.append({})
    for k in xrange(-d, d + 1, 2):
      # Extend from whichever neighbouring diagonal reaches further.
      if k == -d or k != d and v1[k - 1] < v1[k + 1]:
        x = v1[k + 1]
      else:
        x = v1[k - 1] + 1
      y = x - k
      if doubleEnd:
        # Only the side chosen by 'front' records footsteps; the other
        # side checks them, so a hit means the paths have crossed.
        footstep = (x, y)
        if front and footstep in footsteps:
          done = True
        if not front:
          footsteps[footstep] = d

      # Follow the diagonal (matching characters) as far as possible.
      while (not done and x < text1_length and y < text2_length and
             text1[x] == text2[y]):
        x += 1
        y += 1
        if doubleEnd:
          footstep = (x, y)
          if front and footstep in footsteps:
            done = True
          if not front:
            footsteps[footstep] = d

      v1[k] = x
      v_map1[d][(x, y)] = True
      if x == text1_length and y == text2_length:
        # Reached the end in single-path mode.
        return self.diff_path1(v_map1, text1, text2)
      elif done:
        # Front path ran over reverse path.
        v_map2 = v_map2[:footsteps[footstep] + 1]
        a = self.diff_path1(v_map1, text1[:x], text2[:y])
        b = self.diff_path2(v_map2, text1[x:], text2[y:])
        return a + b

    if doubleEnd:
      # Walk the reverse path one step.
      v_map2.append({})
      for k in xrange(-d, d + 1, 2):
        if k == -d or k != d and v2[k - 1] < v2[k + 1]:
          x = v2[k + 1]
        else:
          x = v2[k - 1] + 1
        y = x - k
        # Reverse coordinates are measured from the ends of the texts.
        footstep = (text1_length - x, text2_length - y)
        if not front and footstep in footsteps:
          done = True
        if front:
          footsteps[footstep] = d
        while (not done and x < text1_length and y < text2_length and
               text1[-x - 1] == text2[-y - 1]):
          x += 1
          y += 1
          footstep = (text1_length - x, text2_length - y)
          if not front and footstep in footsteps:
            done = True
          if front:
            footsteps[footstep] = d

        v2[k] = x
        v_map2[d][(x, y)] = True
        if done:
          # Reverse path ran over front path.
          v_map1 = v_map1[:footsteps[footstep] + 1]
          a = self.diff_path1(v_map1, text1[:text1_length - x],
                              text2[:text2_length - y])
          b = self.diff_path2(v_map2, text1[text1_length - x:],
                              text2[text2_length - y:])
          return a + b

  # Number of diffs equals number of characters, no commonality at all.
  return None
|
402 |
|
def diff_path1(self, v_map, text1, text2):
  """Work from the middle back to the start to determine the path.

  Backtracks through the per-cost point maps built by diff_map, prepending
  one character per step and coalescing runs of the same operation.

  Args:
    v_map: Array of paths.
    text1: Old string fragment to be diffed.
    text2: New string fragment to be diffed.

  Returns:
    Array of diff tuples.
  """
  path = []
  x = len(text1)
  y = len(text2)
  last_op = None  # Tracks the previous op so adjacent chars merge.
  for d in xrange(len(v_map) - 2, -1, -1):
    while True:
      if (x - 1, y) in v_map[d]:
        # Step came from a deletion (consumed a char of text1).
        x -= 1
        if last_op == self.DIFF_DELETE:
          path[0] = (self.DIFF_DELETE, text1[x] + path[0][1])
        else:
          path[:0] = [(self.DIFF_DELETE, text1[x])]
        last_op = self.DIFF_DELETE
        break
      elif (x, y - 1) in v_map[d]:
        # Step came from an insertion (consumed a char of text2).
        y -= 1
        if last_op == self.DIFF_INSERT:
          path[0] = (self.DIFF_INSERT, text2[y] + path[0][1])
        else:
          path[:0] = [(self.DIFF_INSERT, text2[y])]
        last_op = self.DIFF_INSERT
        break
      else:
        # Neither neighbour matched: must be a diagonal (equality) step.
        x -= 1
        y -= 1
        assert text1[x] == text2[y], ("No diagonal.  " +
                                      "Can't happen. (diff_path1)")
        if last_op == self.DIFF_EQUAL:
          path[0] = (self.DIFF_EQUAL, text1[x] + path[0][1])
        else:
          path[:0] = [(self.DIFF_EQUAL, text1[x])]
        last_op = self.DIFF_EQUAL
  return path
|
447 |
|
def diff_path2(self, v_map, text1, text2):
  """Work from the middle back to the end to determine the path.

  Mirror image of diff_path1: indexes the fragments from their far ends
  and appends (rather than prepends) to the path.

  Args:
    v_map: Array of paths.
    text1: Old string fragment to be diffed.
    text2: New string fragment to be diffed.

  Returns:
    Array of diff tuples.
  """
  path = []
  x = len(text1)
  y = len(text2)
  last_op = None  # Tracks the previous op so adjacent chars merge.
  for d in xrange(len(v_map) - 2, -1, -1):
    while True:
      if (x - 1, y) in v_map[d]:
        # Step came from a deletion (consumed a char of text1).
        x -= 1
        if last_op == self.DIFF_DELETE:
          path[-1] = (self.DIFF_DELETE, path[-1][1] + text1[-x - 1])
        else:
          path.append((self.DIFF_DELETE, text1[-x - 1]))
        last_op = self.DIFF_DELETE
        break
      elif (x, y - 1) in v_map[d]:
        # Step came from an insertion (consumed a char of text2).
        y -= 1
        if last_op == self.DIFF_INSERT:
          path[-1] = (self.DIFF_INSERT, path[-1][1] + text2[-y - 1])
        else:
          path.append((self.DIFF_INSERT, text2[-y - 1]))
        last_op = self.DIFF_INSERT
        break
      else:
        # Neither neighbour matched: must be a diagonal (equality) step.
        x -= 1
        y -= 1
        assert text1[-x - 1] == text2[-y - 1], ("No diagonal.  " +
                                                "Can't happen. (diff_path2)")
        if last_op == self.DIFF_EQUAL:
          path[-1] = (self.DIFF_EQUAL, path[-1][1] + text1[-x - 1])
        else:
          path.append((self.DIFF_EQUAL, text1[-x - 1]))
        last_op = self.DIFF_EQUAL
  return path
|
492 |
|
def diff_commonPrefix(self, text1, text2):
  """Determine the common prefix of two strings.

  Args:
    text1: First string.
    text2: Second string.

  Returns:
    The number of characters common to the start of each string.
  """
  # Trivial rejections: empty input or differing first characters.
  if not text1 or not text2 or text1[0] != text2[0]:
    return 0
  # Binary search over the candidate prefix length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  lo = 0
  hi = min(len(text1), len(text2))
  mid = hi
  start = 0
  while lo < mid:
    if text1[start:mid] == text2[start:mid]:
      # Prefix holds up to mid; never re-compare the confirmed part.
      lo = mid
      start = lo
    else:
      hi = mid
    mid = int((hi - lo) / 2 + lo)
  return mid
|
520 |
|
def diff_commonSuffix(self, text1, text2):
  """Determine the common suffix of two strings.

  Args:
    text1: First string.
    text2: Second string.

  Returns:
    The number of characters common to the end of each string.
  """
  # Trivial rejections: empty input or differing last characters.
  if not text1 or not text2 or text1[-1] != text2[-1]:
    return 0
  # Binary search over the candidate suffix length.
  # Performance analysis: http://neil.fraser.name/news/2007/10/09/
  lo = 0
  hi = min(len(text1), len(text2))
  mid = hi
  end = 0
  while lo < mid:
    if (text1[-mid:len(text1) - end] ==
        text2[-mid:len(text2) - end]):
      # Suffix holds up to mid; never re-compare the confirmed tail.
      lo = mid
      end = lo
    else:
      hi = mid
    mid = int((hi - lo) / 2 + lo)
  return mid
|
549 |
|
550 def diff_halfMatch(self, text1, text2): |
|
551 """Do the two texts share a substring which is at least half the length of |
|
552 the longer text? |
|
553 |
|
554 Args: |
|
555 text1: First string. |
|
556 text2: Second string. |
|
557 |
|
558 Returns: |
|
559 Five element Array, containing the prefix of text1, the suffix of text1, |
|
560 the prefix of text2, the suffix of text2 and the common middle. Or None |
|
561 if there was no match. |
|
562 """ |
|
563 if len(text1) > len(text2): |
|
564 (longtext, shorttext) = (text1, text2) |
|
565 else: |
|
566 (shorttext, longtext) = (text1, text2) |
|
567 if len(longtext) < 10 or len(shorttext) < 1: |
|
568 return None # Pointless. |
|
569 |
|
570 def diff_halfMatchI(longtext, shorttext, i): |
|
571 """Does a substring of shorttext exist within longtext such that the |
|
572 substring is at least half the length of longtext? |
|
573 Closure, but does not reference any external variables. |
|
574 |
|
575 Args: |
|
576 longtext: Longer string. |
|
577 shorttext: Shorter string. |
|
578 i: Start index of quarter length substring within longtext. |
|
579 |
|
580 Returns: |
|
581 Five element Array, containing the prefix of longtext, the suffix of |
|
582 longtext, the prefix of shorttext, the suffix of shorttext and the |
|
583 common middle. Or None if there was no match. |
|
584 """ |
|
585 seed = longtext[i:i + len(longtext) / 4] |
|
586 best_common = '' |
|
587 j = shorttext.find(seed) |
|
588 while j != -1: |
|
589 prefixLength = self.diff_commonPrefix(longtext[i:], shorttext[j:]) |
|
590 suffixLength = self.diff_commonSuffix(longtext[:i], shorttext[:j]) |
|
591 if len(best_common) < suffixLength + prefixLength: |
|
592 best_common = (shorttext[j - suffixLength:j] + |
|
593 shorttext[j:j + prefixLength]) |
|
594 best_longtext_a = longtext[:i - suffixLength] |
|
595 best_longtext_b = longtext[i + prefixLength:] |
|
596 best_shorttext_a = shorttext[:j - suffixLength] |
|
597 best_shorttext_b = shorttext[j + prefixLength:] |
|
598 j = shorttext.find(seed, j + 1) |
|
599 |
|
600 if len(best_common) >= len(longtext) / 2: |
|
601 return (best_longtext_a, best_longtext_b, |
|
602 best_shorttext_a, best_shorttext_b, best_common) |
|
603 else: |
|
604 return None |
|
605 |
|
606 # First check if the second quarter is the seed for a half-match. |
|
607 hm1 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 3) / 4) |
|
608 # Check again based on the third quarter. |
|
609 hm2 = diff_halfMatchI(longtext, shorttext, (len(longtext) + 1) / 2) |
|
610 if not hm1 and not hm2: |
|
611 return None |
|
612 elif not hm2: |
|
613 hm = hm1 |
|
614 elif not hm1: |
|
615 hm = hm2 |
|
616 else: |
|
617 # Both matched. Select the longest. |
|
618 if len(hm1[4]) > len(hm2[4]): |
|
619 hm = hm1 |
|
620 else: |
|
621 hm = hm2 |
|
622 |
|
623 # A half-match was found, sort out the return data. |
|
624 if len(text1) > len(text2): |
|
625 (text1_a, text1_b, text2_a, text2_b, mid_common) = hm |
|
626 else: |
|
627 (text2_a, text2_b, text1_a, text1_b, mid_common) = hm |
|
628 return (text1_a, text1_b, text2_a, text2_b, mid_common) |
|
629 |
|
630 def diff_cleanupSemantic(self, diffs): |
|
631 """Reduce the number of edits by eliminating semantically trivial |
|
632 equalities. |
|
633 |
|
634 Args: |
|
635 diffs: Array of diff tuples. |
|
636 """ |
|
637 changes = False |
|
638 equalities = [] # Stack of indices where equalities are found. |
|
639 lastequality = None # Always equal to equalities[-1][1] |
|
640 pointer = 0 # Index of current position. |
|
641 length_changes1 = 0 # Number of chars that changed prior to the equality. |
|
642 length_changes2 = 0 # Number of chars that changed after the equality. |
|
643 while pointer < len(diffs): |
|
644 if diffs[pointer][0] == self.DIFF_EQUAL: # equality found |
|
645 equalities.append(pointer) |
|
646 length_changes1 = length_changes2 |
|
647 length_changes2 = 0 |
|
648 lastequality = diffs[pointer][1] |
|
649 else: # an insertion or deletion |
|
650 length_changes2 += len(diffs[pointer][1]) |
|
651 if (lastequality != None and (len(lastequality) <= length_changes1) and |
|
652 (len(lastequality) <= length_changes2)): |
|
653 # Duplicate record |
|
654 diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality)) |
|
655 # Change second copy to insert. |
|
656 diffs[equalities[-1] + 1] = (self.DIFF_INSERT, |
|
657 diffs[equalities[-1] + 1][1]) |
|
658 # Throw away the equality we just deleted. |
|
659 equalities.pop() |
|
660 # Throw away the previous equality (it needs to be reevaluated). |
|
661 if len(equalities) != 0: |
|
662 equalities.pop() |
|
663 if len(equalities): |
|
664 pointer = equalities[-1] |
|
665 else: |
|
666 pointer = -1 |
|
667 length_changes1 = 0 # Reset the counters. |
|
668 length_changes2 = 0 |
|
669 lastequality = None |
|
670 changes = True |
|
671 pointer += 1 |
|
672 |
|
673 if changes: |
|
674 self.diff_cleanupMerge(diffs) |
|
675 |
|
676 self.diff_cleanupSemanticLossless(diffs) |
|
677 |
|
def diff_cleanupSemanticLossless(self, diffs):
  """Look for single edits surrounded on both sides by equalities
  which can be shifted sideways to align the edit to a word boundary.
  e.g: The c<ins>at c</ins>ame. -> The <ins>cat </ins>came.

  Modifies diffs in place; total text content is unchanged (lossless).

  Args:
    diffs: Array of diff tuples.
  """

  def diff_cleanupSemanticScore(one, two):
    """Given two strings, compute a score representing whether the
    internal boundary falls on logical boundaries.
    Scores range from 5 (best) to 0 (worst).
    Closure, but does not reference any external variables.

    Args:
      one: First string.
      two: Second string.

    Returns:
      The score.
    """
    if not one or not two:
      # Edges are the best.
      return 5

    # Each port of this function behaves slightly differently due to
    # subtle differences in each language's definition of things like
    # 'whitespace'.  Since this function's purpose is largely cosmetic,
    # the choice has been made to use each language's native features
    # rather than force total conformity.
    score = 0
    # One point for non-alphanumeric.
    if not one[-1].isalnum() or not two[0].isalnum():
      score += 1
    # Two points for whitespace.
    if one[-1].isspace() or two[0].isspace():
      score += 1
    # Three points for line breaks.
    if (one[-1] == "\r" or one[-1] == "\n" or
        two[0] == "\r" or two[0] == "\n"):
      score += 1
    # Four points for blank lines.
    if (re.search("\\n\\r?\\n$", one) or
        re.match("^\\r?\\n\\r?\\n", two)):
      score += 1
    return score

  pointer = 1
  # Intentionally ignore the first and last element (don't need checking).
  while pointer < len(diffs) - 1:
    if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
        diffs[pointer + 1][0] == self.DIFF_EQUAL):
      # This is a single edit surrounded by equalities.
      equality1 = diffs[pointer - 1][1]
      edit = diffs[pointer][1]
      equality2 = diffs[pointer + 1][1]

      # First, shift the edit as far left as possible.
      commonOffset = self.diff_commonSuffix(equality1, edit)
      if commonOffset:
        commonString = edit[-commonOffset:]
        equality1 = equality1[:-commonOffset]
        edit = commonString + edit[:-commonOffset]
        equality2 = commonString + equality2

      # Second, step character by character right, looking for the best fit.
      bestEquality1 = equality1
      bestEdit = edit
      bestEquality2 = equality2
      bestScore = (diff_cleanupSemanticScore(equality1, edit) +
                   diff_cleanupSemanticScore(edit, equality2))
      while edit and equality2 and edit[0] == equality2[0]:
        # Slide the edit one character to the right.
        equality1 += edit[0]
        edit = edit[1:] + equality2[0]
        equality2 = equality2[1:]
        score = (diff_cleanupSemanticScore(equality1, edit) +
                 diff_cleanupSemanticScore(edit, equality2))
        # The >= encourages trailing rather than leading whitespace on edits.
        if score >= bestScore:
          bestScore = score
          bestEquality1 = equality1
          bestEdit = edit
          bestEquality2 = equality2

      if diffs[pointer - 1][1] != bestEquality1:
        # We have an improvement, save it back to the diff.
        if bestEquality1:
          diffs[pointer - 1] = (diffs[pointer - 1][0], bestEquality1)
        else:
          # Equality shrank to nothing; remove it and rebase the pointer.
          del diffs[pointer - 1]
          pointer -= 1
        diffs[pointer] = (diffs[pointer][0], bestEdit)
        if bestEquality2:
          diffs[pointer + 1] = (diffs[pointer + 1][0], bestEquality2)
        else:
          del diffs[pointer + 1]
          pointer -= 1
    pointer += 1
|
777 |
|
def diff_cleanupEfficiency(self, diffs):
  """Reduce the number of edits by eliminating operationally trivial
  equalities.

  A short equality (shorter than Diff_EditCost) flanked by edits costs more
  to keep than to fold into the surrounding delete/insert pair.  Modifies
  diffs in place.

  Args:
    diffs: Array of diff tuples.
  """
  changes = False
  equalities = []  # Stack of indices where equalities are found.
  lastequality = ''  # Always equal to equalities[-1][1]
  pointer = 0  # Index of current position.
  pre_ins = False  # Is there an insertion operation before the last equality.
  pre_del = False  # Is there a deletion operation before the last equality.
  post_ins = False  # Is there an insertion operation after the last equality.
  post_del = False  # Is there a deletion operation after the last equality.
  while pointer < len(diffs):
    if diffs[pointer][0] == self.DIFF_EQUAL:  # equality found
      if (len(diffs[pointer][1]) < self.Diff_EditCost and
          (post_ins or post_del)):
        # Candidate found.
        equalities.append(pointer)
        pre_ins = post_ins
        pre_del = post_del
        lastequality = diffs[pointer][1]
      else:
        # Not a candidate, and can never become one.
        equalities = []
        lastequality = ''

      post_ins = post_del = False
    else:  # an insertion or deletion
      if diffs[pointer][0] == self.DIFF_DELETE:
        post_del = True
      else:
        post_ins = True

      # Five types to be split:
      # <ins>A</ins><del>B</del>XY<ins>C</ins><del>D</del>
      # <ins>A</ins>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<ins>C</ins>
      # <ins>A</del>X<ins>C</ins><del>D</del>
      # <ins>A</ins><del>B</del>X<del>C</del>

      if lastequality and ((pre_ins and pre_del and post_ins and post_del) or
                           ((len(lastequality) < self.Diff_EditCost / 2) and
                            (pre_ins + pre_del + post_ins + post_del) == 3)):
        # Duplicate record
        diffs.insert(equalities[-1], (self.DIFF_DELETE, lastequality))
        # Change second copy to insert.
        diffs[equalities[-1] + 1] = (self.DIFF_INSERT,
                                     diffs[equalities[-1] + 1][1])
        equalities.pop()  # Throw away the equality we just deleted
        lastequality = ''
        if pre_ins and pre_del:
          # No changes made which could affect previous entry, keep going.
          post_ins = post_del = True
          equalities = []
        else:
          if len(equalities):
            equalities.pop()  # Throw away the previous equality
          # Rewind to re-examine from the previous equality (or the start;
          # -1 becomes 0 after the increment below).
          if len(equalities):
            pointer = equalities[-1]
          else:
            pointer = -1
          post_ins = post_del = False
        changes = True
    pointer += 1

  if changes:
    self.diff_cleanupMerge(diffs)
|
848 |
|
  def diff_cleanupMerge(self, diffs):
    """Reorder and merge like edit sections.  Merge equalities.
    Any edit section can move as long as it doesn't cross an equality.

    First pass: coalesce adjacent runs of inserts/deletes (factoring common
    prefixes/suffixes into neighbouring equalities) and merge adjacent
    equalities.  Second pass: slide single edits over an adjacent equality
    when doing so eliminates that equality; recurses if anything moved.

    Args:
      diffs: Array of diff tuples (op, text).  Modified in place.
    """
    diffs.append((self.DIFF_EQUAL, ''))  # Add a dummy entry at the end.
    pointer = 0
    # Number of pending DELETE/INSERT records and their concatenated text,
    # accumulated until the next equality is reached.
    count_delete = 0
    count_insert = 0
    text_delete = ''
    text_insert = ''
    while pointer < len(diffs):
      if diffs[pointer][0] == self.DIFF_INSERT:
        count_insert += 1
        text_insert += diffs[pointer][1]
        pointer += 1
      elif diffs[pointer][0] == self.DIFF_DELETE:
        count_delete += 1
        text_delete += diffs[pointer][1]
        pointer += 1
      elif diffs[pointer][0] == self.DIFF_EQUAL:
        # Upon reaching an equality, check for prior redundancies.
        if count_delete != 0 or count_insert != 0:
          if count_delete != 0 and count_insert != 0:
            # Factor out any common prefixies.
            commonlength = self.diff_commonPrefix(text_insert, text_delete)
            if commonlength != 0:
              # x indexes the entry just before the pending run of edits.
              x = pointer - count_delete - count_insert - 1
              if x >= 0 and diffs[x][0] == self.DIFF_EQUAL:
                # Append the common prefix to the preceding equality.
                diffs[x] = (diffs[x][0], diffs[x][1] +
                            text_insert[:commonlength])
              else:
                # No preceding equality: create one at the very front.
                diffs.insert(0, (self.DIFF_EQUAL, text_insert[:commonlength]))
                pointer += 1
              text_insert = text_insert[commonlength:]
              text_delete = text_delete[commonlength:]
            # Factor out any common suffixies.
            commonlength = self.diff_commonSuffix(text_insert, text_delete)
            if commonlength != 0:
              # Prepend the common suffix to the equality we just reached.
              diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] +
                                diffs[pointer][1])
              text_insert = text_insert[:-commonlength]
              text_delete = text_delete[:-commonlength]
          # Delete the offending records and add the merged ones.
          if count_delete == 0:
            diffs[pointer - count_insert : pointer] = [
                (self.DIFF_INSERT, text_insert)]
          elif count_insert == 0:
            diffs[pointer - count_delete : pointer] = [
                (self.DIFF_DELETE, text_delete)]
          else:
            diffs[pointer - count_delete - count_insert : pointer] = [
                (self.DIFF_DELETE, text_delete),
                (self.DIFF_INSERT, text_insert)]
          # Reposition the pointer just past the replacement records.
          pointer = pointer - count_delete - count_insert + 1
          if count_delete != 0:
            pointer += 1
          if count_insert != 0:
            pointer += 1
        elif pointer != 0 and diffs[pointer - 1][0] == self.DIFF_EQUAL:
          # Merge this equality with the previous one.
          diffs[pointer - 1] = (diffs[pointer - 1][0],
                                diffs[pointer - 1][1] + diffs[pointer][1])
          del diffs[pointer]
        else:
          pointer += 1

        # Reset the accumulators for the next run of edits.
        count_insert = 0
        count_delete = 0
        text_delete = ''
        text_insert = ''

    if diffs[-1][1] == '':
      diffs.pop()  # Remove the dummy entry at the end.

    # Second pass: look for single edits surrounded on both sides by equalities
    # which can be shifted sideways to eliminate an equality.
    # e.g: A<ins>BA</ins>C -> <ins>AB</ins>AC
    changes = False
    pointer = 1
    # Intentionally ignore the first and last element (don't need checking).
    while pointer < len(diffs) - 1:
      if (diffs[pointer - 1][0] == self.DIFF_EQUAL and
          diffs[pointer + 1][0] == self.DIFF_EQUAL):
        # This is a single edit surrounded by equalities.
        if diffs[pointer][1].endswith(diffs[pointer - 1][1]):
          # Shift the edit over the previous equality.
          diffs[pointer] = (diffs[pointer][0],
                            diffs[pointer - 1][1] +
                            diffs[pointer][1][:-len(diffs[pointer - 1][1])])
          diffs[pointer + 1] = (diffs[pointer + 1][0],
                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
          del diffs[pointer - 1]
          changes = True
        elif diffs[pointer][1].startswith(diffs[pointer + 1][1]):
          # Shift the edit over the next equality.
          diffs[pointer - 1] = (diffs[pointer - 1][0],
                                diffs[pointer - 1][1] + diffs[pointer + 1][1])
          diffs[pointer] = (diffs[pointer][0],
                            diffs[pointer][1][len(diffs[pointer + 1][1]):] +
                            diffs[pointer + 1][1])
          del diffs[pointer + 1]
          changes = True
      pointer += 1

    # If shifts were made, the diff needs reordering and another shift sweep.
    if changes:
      self.diff_cleanupMerge(diffs)
|
959 |
|
960 def diff_xIndex(self, diffs, loc): |
|
961 """loc is a location in text1, compute and return the equivalent location |
|
962 in text2. e.g. "The cat" vs "The big cat", 1->1, 5->8 |
|
963 |
|
964 Args: |
|
965 diffs: Array of diff tuples. |
|
966 loc: Location within text1. |
|
967 |
|
968 Returns: |
|
969 Location within text2. |
|
970 """ |
|
971 chars1 = 0 |
|
972 chars2 = 0 |
|
973 last_chars1 = 0 |
|
974 last_chars2 = 0 |
|
975 for x in xrange(len(diffs)): |
|
976 (op, text) = diffs[x] |
|
977 if op != self.DIFF_INSERT: # Equality or deletion. |
|
978 chars1 += len(text) |
|
979 if op != self.DIFF_DELETE: # Equality or insertion. |
|
980 chars2 += len(text) |
|
981 if chars1 > loc: # Overshot the location. |
|
982 break |
|
983 last_chars1 = chars1 |
|
984 last_chars2 = chars2 |
|
985 |
|
986 if len(diffs) != x and diffs[x][0] == self.DIFF_DELETE: |
|
987 # The location was deleted. |
|
988 return last_chars2 |
|
989 # Add the remaining len(character). |
|
990 return last_chars2 + (loc - last_chars1) |
|
991 |
|
992 def diff_prettyHtml(self, diffs): |
|
993 """Convert a diff array into a pretty HTML report. |
|
994 |
|
995 Args: |
|
996 diffs: Array of diff tuples. |
|
997 |
|
998 Returns: |
|
999 HTML representation. |
|
1000 """ |
|
1001 html = [] |
|
1002 i = 0 |
|
1003 for (op, data) in diffs: |
|
1004 text = (data.replace("&", "&").replace("<", "<") |
|
1005 .replace(">", ">").replace("\n", "¶<BR>")) |
|
1006 if op == self.DIFF_INSERT: |
|
1007 html.append("<INS STYLE=\"background:#E6FFE6;\" TITLE=\"i=%i\">%s</INS>" |
|
1008 % (i, text)) |
|
1009 elif op == self.DIFF_DELETE: |
|
1010 html.append("<DEL STYLE=\"background:#FFE6E6;\" TITLE=\"i=%i\">%s</DEL>" |
|
1011 % (i, text)) |
|
1012 elif op == self.DIFF_EQUAL: |
|
1013 html.append("<SPAN TITLE=\"i=%i\">%s</SPAN>" % (i, text)) |
|
1014 if op != self.DIFF_DELETE: |
|
1015 i += len(data) |
|
1016 return "".join(html) |
|
1017 |
|
1018 def diff_text1(self, diffs): |
|
1019 """Compute and return the source text (all equalities and deletions). |
|
1020 |
|
1021 Args: |
|
1022 diffs: Array of diff tuples. |
|
1023 |
|
1024 Returns: |
|
1025 Source text. |
|
1026 """ |
|
1027 text = [] |
|
1028 for (op, data) in diffs: |
|
1029 if op != self.DIFF_INSERT: |
|
1030 text.append(data) |
|
1031 return "".join(text) |
|
1032 |
|
1033 def diff_text2(self, diffs): |
|
1034 """Compute and return the destination text (all equalities and insertions). |
|
1035 |
|
1036 Args: |
|
1037 diffs: Array of diff tuples. |
|
1038 |
|
1039 Returns: |
|
1040 Destination text. |
|
1041 """ |
|
1042 text = [] |
|
1043 for (op, data) in diffs: |
|
1044 if op != self.DIFF_DELETE: |
|
1045 text.append(data) |
|
1046 return "".join(text) |
|
1047 |
|
1048 def diff_levenshtein(self, diffs): |
|
1049 """Compute the Levenshtein distance; the number of inserted, deleted or |
|
1050 substituted characters. |
|
1051 |
|
1052 Args: |
|
1053 diffs: Array of diff tuples. |
|
1054 |
|
1055 Returns: |
|
1056 Number of changes. |
|
1057 """ |
|
1058 levenshtein = 0 |
|
1059 insertions = 0 |
|
1060 deletions = 0 |
|
1061 for (op, data) in diffs: |
|
1062 if op == self.DIFF_INSERT: |
|
1063 insertions += len(data) |
|
1064 elif op == self.DIFF_DELETE: |
|
1065 deletions += len(data) |
|
1066 elif op == self.DIFF_EQUAL: |
|
1067 # A deletion and an insertion is one substitution. |
|
1068 levenshtein += max(insertions, deletions) |
|
1069 insertions = 0 |
|
1070 deletions = 0 |
|
1071 levenshtein += max(insertions, deletions) |
|
1072 return levenshtein |
|
1073 |
|
  def diff_toDelta(self, diffs):
    """Crush the diff into an encoded string which describes the operations
    required to transform text1 into text2.
    E.g. =3\t-2\t+ing  -> Keep 3 chars, delete 2 chars, insert 'ing'.
    Operations are tab-separated.  Inserted text is escaped using %xx notation.

    Args:
      diffs: Array of diff tuples.

    Returns:
      Delta text.
    """
    text = []
    for (op, data) in diffs:
      if op == self.DIFF_INSERT:
        # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
        # NOTE: Python 2 semantics -- encode() here turns a unicode object
        # into a utf-8 byte string before percent-escaping.
        data = data.encode("utf-8")
        # The generous safe-character set keeps the delta human-readable;
        # it must mirror the unquoting in diff_fromDelta.
        text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
      elif op == self.DIFF_DELETE:
        # Deletions and equalities are encoded by length only; the receiver
        # reconstructs the text from its own copy of text1.
        text.append("-%d" % len(data))
      elif op == self.DIFF_EQUAL:
        text.append("=%d" % len(data))
    return "\t".join(text)
|
1097 |
|
  def diff_fromDelta(self, text1, delta):
    """Given the original text1, and an encoded string which describes the
    operations required to transform text1 into text2, compute the full diff.

    Inverse of diff_toDelta.  Uses Python 2 str/unicode semantics and the
    Python 2 'raise Exception, value' syntax.

    Args:
      text1: Source string for the diff.
      delta: Delta text.

    Returns:
      Array of diff tuples.

    Raises:
      ValueError: If invalid input.
    """
    if type(delta) == unicode:
      # Deltas should be composed of a subset of ascii chars, Unicode not
      # required.  If this encode raises UnicodeEncodeError, delta is invalid.
      delta = delta.encode("ascii")
    diffs = []
    pointer = 0  # Cursor in text1
    tokens = delta.split("\t")
    for token in tokens:
      if token == "":
        # Blank tokens are ok (from a trailing \t).
        continue
      # Each token begins with a one character parameter which specifies the
      # operation of this token (delete, insert, equality).
      param = token[1:]
      if token[0] == "+":
        # Insertions carry their own text, %xx-escaped and utf-8 encoded.
        param = urllib.unquote(param).decode("utf-8")
        diffs.append((self.DIFF_INSERT, param))
      elif token[0] == "-" or token[0] == "=":
        # Deletions and equalities carry only a character count.
        try:
          n = int(param)
        except ValueError:
          raise ValueError, "Invalid number in diff_fromDelta: " + param
        if n < 0:
          raise ValueError, "Negative number in diff_fromDelta: " + param
        # Recover the actual text from our copy of the source string.
        text = text1[pointer : pointer + n]
        pointer += n
        if token[0] == "=":
          diffs.append((self.DIFF_EQUAL, text))
        else:
          diffs.append((self.DIFF_DELETE, text))
      else:
        # Anything else is an error.
        raise ValueError, ("Invalid diff operation in diff_fromDelta: " +
            token[0])
    if pointer != len(text1):
      # A valid delta consumes exactly the whole source text.
      raise ValueError, (
          "Delta length (%d) does not equal source text length (%d)." %
          (pointer, len(text1)))
    return diffs
|
1151 |
|
1152 # MATCH FUNCTIONS |
|
1153 |
|
1154 def match_main(self, text, pattern, loc): |
|
1155 """Locate the best instance of 'pattern' in 'text' near 'loc'. |
|
1156 |
|
1157 Args: |
|
1158 text: The text to search. |
|
1159 pattern: The pattern to search for. |
|
1160 loc: The location to search around. |
|
1161 |
|
1162 Returns: |
|
1163 Best match index or -1. |
|
1164 """ |
|
1165 loc = max(0, min(loc, len(text))) |
|
1166 if text == pattern: |
|
1167 # Shortcut (potentially not guaranteed by the algorithm) |
|
1168 return 0 |
|
1169 elif not text: |
|
1170 # Nothing to match. |
|
1171 return -1 |
|
1172 elif text[loc:loc + len(pattern)] == pattern: |
|
1173 # Perfect match at the perfect spot! (Includes case of null pattern) |
|
1174 return loc |
|
1175 else: |
|
1176 # Do a fuzzy compare. |
|
1177 match = self.match_bitap(text, pattern, loc) |
|
1178 return match |
|
1179 |
|
  def match_bitap(self, text, pattern, loc):
    """Locate the best instance of 'pattern' in 'text' near 'loc' using the
    Bitap algorithm.

    A match's quality combines its error count and its distance from 'loc'
    (see match_bitapScore); the best-scoring match wins.

    Args:
      text: The text to search.
      pattern: The pattern to search for.
      loc: The location to search around.

    Returns:
      Best match index or -1.
    """
    # Python doesn't have a maxint limit, so ignore this check.
    #if self.Match_MaxBits != 0 and len(pattern) > self.Match_MaxBits:
    #  raise ValueError("Pattern too long for this application.")

    # Initialise the alphabet.
    s = self.match_alphabet(pattern)

    def match_bitapScore(e, x):
      """Compute and return the score for a match with e errors and x location.
      Accesses loc and pattern through being a closure.

      Args:
        e: Number of errors in match.
        x: Location of match.

      Returns:
        Overall score for match (0.0 = good, 1.0 = bad).
      """
      accuracy = float(e) / len(pattern)
      proximity = abs(loc - x)
      if not self.Match_Distance:
        # Dodge divide by zero error.
        return proximity and 1.0 or accuracy
      return accuracy + (proximity / float(self.Match_Distance))

    # Highest score beyond which we give up.
    score_threshold = self.Match_Threshold
    # Is there a nearby exact match? (speedup)
    best_loc = text.find(pattern, loc)
    if best_loc != -1:
      score_threshold = min(match_bitapScore(0, best_loc), score_threshold)
    # What about in the other direction? (speedup)
    best_loc = text.rfind(pattern, loc + len(pattern))
    if best_loc != -1:
      score_threshold = min(match_bitapScore(0, best_loc), score_threshold)

    # Initialise the bit arrays.
    # matchmask has a 1 in the bit position corresponding to a complete match.
    matchmask = 1 << (len(pattern) - 1)
    best_loc = -1

    bin_max = len(pattern) + len(text)
    # Empty initialization added to appease pychecker.
    last_rd = None
    for d in xrange(len(pattern)):
      # Scan for the best match each iteration allows for one more error.
      # Run a binary search to determine how far from 'loc' we can stray at
      # this error level.
      bin_min = 0
      bin_mid = bin_max
      while bin_min < bin_mid:
        if match_bitapScore(d, loc + bin_mid) <= score_threshold:
          bin_min = bin_mid
        else:
          bin_max = bin_mid
        # NOTE: relies on Python 2 '/' flooring for ints.
        bin_mid = (bin_max - bin_min) / 2 + bin_min

      # Use the result from this iteration as the maximum for the next.
      bin_max = bin_mid
      start = max(1, loc - bin_mid + 1)
      finish = min(loc + bin_mid, len(text)) + len(pattern)

      # rd is this pass's bit-state array; only the appended sentinel is read
      # before being written (the scan below fills it right-to-left).
      rd = range(finish + 1)
      rd.append((1 << d) - 1)
      for j in xrange(finish, start - 1, -1):
        if len(text) <= j - 1:
          # Out of range.
          charMatch = 0
        else:
          charMatch = s.get(text[j - 1], 0)
        if d == 0:  # First pass: exact match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
        else:  # Subsequent passes: fuzzy match.
          rd[j] = ((rd[j + 1] << 1) | 1) & charMatch | (
              ((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]
        if rd[j] & matchmask:
          score = match_bitapScore(d, j - 1)
          # This match will almost certainly be better than any existing match.
          # But check anyway.
          if score <= score_threshold:
            # Told you so.
            score_threshold = score
            best_loc = j - 1
            if best_loc > loc:
              # When passing loc, don't exceed our current distance from loc.
              start = max(1, 2 * loc - best_loc)
            else:
              # Already passed loc, downhill from here on in.
              break
      # No hope for a (better) match at greater error levels.
      if match_bitapScore(d + 1, loc) > score_threshold:
        break
      last_rd = rd
    return best_loc
|
1285 |
|
1286 def match_alphabet(self, pattern): |
|
1287 """Initialise the alphabet for the Bitap algorithm. |
|
1288 |
|
1289 Args: |
|
1290 pattern: The text to encode. |
|
1291 |
|
1292 Returns: |
|
1293 Hash of character locations. |
|
1294 """ |
|
1295 s = {} |
|
1296 for char in pattern: |
|
1297 s[char] = 0 |
|
1298 for i in xrange(len(pattern)): |
|
1299 s[pattern[i]] |= 1 << (len(pattern) - i - 1) |
|
1300 return s |
|
1301 |
|
1302 # PATCH FUNCTIONS |
|
1303 |
|
1304 def patch_addContext(self, patch, text): |
|
1305 """Increase the context until it is unique, |
|
1306 but don't let the pattern expand beyond Match_MaxBits. |
|
1307 |
|
1308 Args: |
|
1309 patch: The patch to grow. |
|
1310 text: Source text. |
|
1311 """ |
|
1312 if len(text) == 0: |
|
1313 return |
|
1314 pattern = text[patch.start2 : patch.start2 + patch.length1] |
|
1315 padding = 0 |
|
1316 |
|
1317 # Look for the first and last matches of pattern in text. If two different |
|
1318 # matches are found, increase the pattern length. |
|
1319 while (text.find(pattern) != text.rfind(pattern) and (self.Match_MaxBits == |
|
1320 0 or len(pattern) < self.Match_MaxBits - self.Patch_Margin - |
|
1321 self.Patch_Margin)): |
|
1322 padding += self.Patch_Margin |
|
1323 pattern = text[max(0, patch.start2 - padding) : |
|
1324 patch.start2 + patch.length1 + padding] |
|
1325 # Add one chunk for good luck. |
|
1326 padding += self.Patch_Margin |
|
1327 |
|
1328 # Add the prefix. |
|
1329 prefix = text[max(0, patch.start2 - padding) : patch.start2] |
|
1330 if prefix: |
|
1331 patch.diffs[:0] = [(self.DIFF_EQUAL, prefix)] |
|
1332 # Add the suffix. |
|
1333 suffix = text[patch.start2 + patch.length1 : |
|
1334 patch.start2 + patch.length1 + padding] |
|
1335 if suffix: |
|
1336 patch.diffs.append((self.DIFF_EQUAL, suffix)) |
|
1337 |
|
1338 # Roll back the start points. |
|
1339 patch.start1 -= len(prefix) |
|
1340 patch.start2 -= len(prefix) |
|
1341 # Extend lengths. |
|
1342 patch.length1 += len(prefix) + len(suffix) |
|
1343 patch.length2 += len(prefix) + len(suffix) |
|
1344 |
|
  def patch_make(self, a, b=None, c=None):
    """Compute a list of patches to turn text1 into text2.
    Use diffs if provided, otherwise compute it ourselves.
    There are four ways to call this function, depending on what data is
    available to the caller:
    Method 1:
    a = text1, b = text2
    Method 2:
    a = diffs
    Method 3 (optimal):
    a = text1, b = diffs
    Method 4 (deprecated, use method 3):
    a = text1, b = text2, c = diffs

    Args:
      a: text1 (methods 1,3,4) or Array of diff tuples for text1 to
          text2 (method 2).
      b: text2 (methods 1,4) or Array of diff tuples for text1 to
          text2 (method 3) or undefined (method 2).
      c: Array of diff tuples for text1 to text2 (method 4) or
          undefined (methods 1,2,3).

    Returns:
      Array of patch objects.
    """
    text1 = None
    diffs = None
    # Note that texts may arrive as 'str' or 'unicode'.
    # ('basestring' is the Python 2 common ancestor of both.)
    if isinstance(a, basestring) and isinstance(b, basestring) and c is None:
      # Method 1: text1, text2
      # Compute diffs from text1 and text2.
      text1 = a
      diffs = self.diff_main(text1, b, True)
      if len(diffs) > 2:
        self.diff_cleanupSemantic(diffs)
        self.diff_cleanupEfficiency(diffs)
    elif isinstance(a, list) and b is None and c is None:
      # Method 2: diffs
      # Compute text1 from diffs.
      diffs = a
      text1 = self.diff_text1(diffs)
    elif isinstance(a, basestring) and isinstance(b, list) and c is None:
      # Method 3: text1, diffs
      text1 = a
      diffs = b
    elif (isinstance(a, basestring) and isinstance(b, basestring) and
          isinstance(c, list)):
      # Method 4: text1, text2, diffs
      # text2 is not used.
      text1 = a
      diffs = c
    else:
      raise ValueError("Unknown call format to patch_make.")

    if not diffs:
      return []  # Get rid of the None case.
    patches = []
    patch = patch_obj()
    char_count1 = 0  # Number of characters into the text1 string.
    char_count2 = 0  # Number of characters into the text2 string.
    prepatch_text = text1  # Recreate the patches to determine context info.
    postpatch_text = text1
    for x in xrange(len(diffs)):
      (diff_type, diff_text) = diffs[x]
      if len(patch.diffs) == 0 and diff_type != self.DIFF_EQUAL:
        # A new patch starts here.
        patch.start1 = char_count1
        patch.start2 = char_count2
      if diff_type == self.DIFF_INSERT:
        # Insertion
        patch.diffs.append(diffs[x])
        patch.length2 += len(diff_text)
        # Mirror the insertion into our working copy of the patched text.
        postpatch_text = (postpatch_text[:char_count2] + diff_text +
                          postpatch_text[char_count2:])
      elif diff_type == self.DIFF_DELETE:
        # Deletion.
        patch.length1 += len(diff_text)
        patch.diffs.append(diffs[x])
        # Mirror the deletion into our working copy of the patched text.
        postpatch_text = (postpatch_text[:char_count2] +
                          postpatch_text[char_count2 + len(diff_text):])
      elif (diff_type == self.DIFF_EQUAL and
            len(diff_text) <= 2 * self.Patch_Margin and
            len(patch.diffs) != 0 and len(diffs) != x + 1):
        # Small equality inside a patch: keep it so the patch stays contiguous.
        patch.diffs.append(diffs[x])
        patch.length1 += len(diff_text)
        patch.length2 += len(diff_text)

      if (diff_type == self.DIFF_EQUAL and
          len(diff_text) >= 2 * self.Patch_Margin):
        # Time for a new patch.
        if len(patch.diffs) != 0:
          self.patch_addContext(patch, prepatch_text)
          patches.append(patch)
          patch = patch_obj()
          # Unlike Unidiff, our patch lists have a rolling context.
          # http://code.google.com/p/google-diff-match-patch/wiki/Unidiff
          # Update prepatch text & pos to reflect the application of the
          # just completed patch.
          prepatch_text = postpatch_text
          char_count1 = char_count2

      # Update the current character count.
      if diff_type != self.DIFF_INSERT:
        char_count1 += len(diff_text)
      if diff_type != self.DIFF_DELETE:
        char_count2 += len(diff_text)

    # Pick up the leftover patch if not empty.
    if len(patch.diffs) != 0:
      self.patch_addContext(patch, prepatch_text)
      patches.append(patch)
    return patches
|
1458 |
|
1459 def patch_deepCopy(self, patches): |
|
1460 """Given an array of patches, return another array that is identical. |
|
1461 |
|
1462 Args: |
|
1463 patches: Array of patch objects. |
|
1464 |
|
1465 Returns: |
|
1466 Array of patch objects. |
|
1467 """ |
|
1468 patchesCopy = [] |
|
1469 for patch in patches: |
|
1470 patchCopy = patch_obj() |
|
1471 # No need to deep copy the tuples since they are immutable. |
|
1472 patchCopy.diffs = patch.diffs[:] |
|
1473 patchCopy.start1 = patch.start1 |
|
1474 patchCopy.start2 = patch.start2 |
|
1475 patchCopy.length1 = patch.length1 |
|
1476 patchCopy.length2 = patch.length2 |
|
1477 patchesCopy.append(patchCopy) |
|
1478 return patchesCopy |
|
1479 |
|
  def patch_apply(self, patches, text):
    """Merge a set of patches onto the text.  Return a patched text, as well
    as a list of true/false values indicating which patches were applied.

    Args:
      patches: Array of patch objects.
      text: Old text.

    Returns:
      Two element Array, containing the new text and an array of boolean values.
    """
    if not patches:
      return (text, [])

    # Deep copy the patches so that no changes are made to originals.
    patches = self.patch_deepCopy(patches)

    # Pad the text on both sides so patches at the edges have context to match.
    nullPadding = self.patch_addPadding(patches)
    text = nullPadding + text + nullPadding
    self.patch_splitMax(patches)

    # delta keeps track of the offset between the expected and actual location
    # of the previous patch.  If there are patches expected at positions 10 and
    # 20, but the first patch was found at 12, delta is 2 and the second patch
    # has an effective expected position of 22.
    delta = 0
    results = []
    for patch in patches:
      expected_loc = patch.start2 + delta
      text1 = self.diff_text1(patch.diffs)
      end_loc = -1
      if len(text1) > self.Match_MaxBits:
        # patch_splitMax will only provide an oversized pattern in the case of
        # a monster delete.  Match the head and tail of the pattern separately.
        start_loc = self.match_main(text, text1[:self.Match_MaxBits],
                                    expected_loc)
        if start_loc != -1:
          end_loc = self.match_main(text, text1[-self.Match_MaxBits:],
              expected_loc + len(text1) - self.Match_MaxBits)
          if end_loc == -1 or start_loc >= end_loc:
            # Can't find valid trailing context.  Drop this patch.
            start_loc = -1
      else:
        start_loc = self.match_main(text, text1, expected_loc)
      if start_loc == -1:
        # No match found.  :(
        results.append(False)
        # Subtract the delta for this failed patch from subsequent patches.
        delta -= patch.length2 - patch.length1
      else:
        # Found a match.  :)
        results.append(True)
        delta = start_loc - expected_loc
        if end_loc == -1:
          text2 = text[start_loc : start_loc + len(text1)]
        else:
          text2 = text[start_loc : end_loc + self.Match_MaxBits]
        if text1 == text2:
          # Perfect match, just shove the replacement text in.
          text = (text[:start_loc] + self.diff_text2(patch.diffs) +
                  text[start_loc + len(text1):])
        else:
          # Imperfect match.
          # Run a diff to get a framework of equivalent indices.
          diffs = self.diff_main(text1, text2, False)
          if (len(text1) > self.Match_MaxBits and
              self.diff_levenshtein(diffs) / float(len(text1)) >
              self.Patch_DeleteThreshold):
            # The end points match, but the content is unacceptably bad.
            results[-1] = False
          else:
            self.diff_cleanupSemanticLossless(diffs)
            index1 = 0
            for (op, data) in patch.diffs:
              if op != self.DIFF_EQUAL:
                # Map this edit's position in text1 onto the matched text2.
                index2 = self.diff_xIndex(diffs, index1)
              if op == self.DIFF_INSERT:  # Insertion
                text = text[:start_loc + index2] + data + text[start_loc +
                                                               index2:]
              elif op == self.DIFF_DELETE:  # Deletion
                text = text[:start_loc + index2] + text[start_loc +
                    self.diff_xIndex(diffs, index1 + len(data)):]
              if op != self.DIFF_DELETE:
                index1 += len(data)
    # Strip the padding off.
    text = text[len(nullPadding):-len(nullPadding)]
    return (text, results)
|
1567 |
|
1568 def patch_addPadding(self, patches): |
|
1569 """Add some padding on text start and end so that edges can match |
|
1570 something. Intended to be called only from within patch_apply. |
|
1571 |
|
1572 Args: |
|
1573 patches: Array of patch objects. |
|
1574 |
|
1575 Returns: |
|
1576 The padding string added to each side. |
|
1577 """ |
|
1578 paddingLength = self.Patch_Margin |
|
1579 nullPadding = "" |
|
1580 for x in xrange(1, paddingLength + 1): |
|
1581 nullPadding += chr(x) |
|
1582 |
|
1583 # Bump all the patches forward. |
|
1584 for patch in patches: |
|
1585 patch.start1 += paddingLength |
|
1586 patch.start2 += paddingLength |
|
1587 |
|
1588 # Add some padding on start of first diff. |
|
1589 patch = patches[0] |
|
1590 diffs = patch.diffs |
|
1591 if not diffs or diffs[0][0] != self.DIFF_EQUAL: |
|
1592 # Add nullPadding equality. |
|
1593 diffs.insert(0, (self.DIFF_EQUAL, nullPadding)) |
|
1594 patch.start1 -= paddingLength # Should be 0. |
|
1595 patch.start2 -= paddingLength # Should be 0. |
|
1596 patch.length1 += paddingLength |
|
1597 patch.length2 += paddingLength |
|
1598 elif paddingLength > len(diffs[0][1]): |
|
1599 # Grow first equality. |
|
1600 extraLength = paddingLength - len(diffs[0][1]) |
|
1601 newText = nullPadding[len(diffs[0][1]):] + diffs[0][1] |
|
1602 diffs[0] = (diffs[0][0], newText) |
|
1603 patch.start1 -= extraLength |
|
1604 patch.start2 -= extraLength |
|
1605 patch.length1 += extraLength |
|
1606 patch.length2 += extraLength |
|
1607 |
|
1608 # Add some padding on end of last diff. |
|
1609 patch = patches[-1] |
|
1610 diffs = patch.diffs |
|
1611 if not diffs or diffs[-1][0] != self.DIFF_EQUAL: |
|
1612 # Add nullPadding equality. |
|
1613 diffs.append((self.DIFF_EQUAL, nullPadding)) |
|
1614 patch.length1 += paddingLength |
|
1615 patch.length2 += paddingLength |
|
1616 elif paddingLength > len(diffs[-1][1]): |
|
1617 # Grow last equality. |
|
1618 extraLength = paddingLength - len(diffs[-1][1]) |
|
1619 newText = diffs[-1][1] + nullPadding[:extraLength] |
|
1620 diffs[-1] = (diffs[-1][0], newText) |
|
1621 patch.length1 += extraLength |
|
1622 patch.length2 += extraLength |
|
1623 |
|
1624 return nullPadding |
|
1625 |
|
  def patch_splitMax(self, patches):
    """Look through the patches and break up any which are longer than the
    maximum limit of the match algorithm.

    Args:
      patches: Array of patch objects.  Modified in place.
    """
    if self.Match_MaxBits == 0:
      # No limit on pattern length; nothing to split.
      return
    # NOTE(review): 'patches' is mutated (del/insert) while iterating over
    # xrange(len(patches)), and rebinding 'x' does not affect the xrange
    # iterator -- this matches the upstream implementation but looks fragile
    # when several oversized patches occur; verify before relying on it.
    for x in xrange(len(patches)):
      if patches[x].length1 > self.Match_MaxBits:
        bigpatch = patches[x]
        # Remove the big old patch.
        del patches[x]
        x -= 1
        patch_size = self.Match_MaxBits
        # Running character offsets into the source and destination texts.
        start1 = bigpatch.start1
        start2 = bigpatch.start2
        precontext = ''
        while len(bigpatch.diffs) != 0:
          # Create one of several smaller patches.
          patch = patch_obj()
          empty = True
          patch.start1 = start1 - len(precontext)
          patch.start2 = start2 - len(precontext)
          if precontext:
            # Seed the new patch with the tail context of the previous one.
            patch.length1 = patch.length2 = len(precontext)
            patch.diffs.append((self.DIFF_EQUAL, precontext))

          # Consume diffs from the big patch until this one is nearly full.
          while (len(bigpatch.diffs) != 0 and
                 patch.length1 < patch_size - self.Patch_Margin):
            (diff_type, diff_text) = bigpatch.diffs[0]
            if diff_type == self.DIFF_INSERT:
              # Insertions are harmless.
              patch.length2 += len(diff_text)
              start2 += len(diff_text)
              patch.diffs.append(bigpatch.diffs.pop(0))
              empty = False
            elif (diff_type == self.DIFF_DELETE and len(patch.diffs) == 1 and
                  patch.diffs[0][0] == self.DIFF_EQUAL and
                  len(diff_text) > 2 * patch_size):
              # This is a large deletion.  Let it pass in one chunk.
              patch.length1 += len(diff_text)
              start1 += len(diff_text)
              empty = False
              patch.diffs.append((diff_type, diff_text))
              del bigpatch.diffs[0]
            else:
              # Deletion or equality.  Only take as much as we can stomach.
              diff_text = diff_text[:patch_size - patch.length1 -
                                    self.Patch_Margin]
              patch.length1 += len(diff_text)
              start1 += len(diff_text)
              if diff_type == self.DIFF_EQUAL:
                patch.length2 += len(diff_text)
                start2 += len(diff_text)
              else:
                empty = False

              patch.diffs.append((diff_type, diff_text))
              if diff_text == bigpatch.diffs[0][1]:
                # Consumed the whole record.
                del bigpatch.diffs[0]
              else:
                # Leave the uneaten remainder at the head of the queue.
                bigpatch.diffs[0] = (bigpatch.diffs[0][0],
                                     bigpatch.diffs[0][1][len(diff_text):])

          # Compute the head context for the next patch.
          precontext = self.diff_text2(patch.diffs)
          precontext = precontext[-self.Patch_Margin:]
          # Append the end context for this patch.
          postcontext = self.diff_text1(bigpatch.diffs)[:self.Patch_Margin]
          if postcontext:
            patch.length1 += len(postcontext)
            patch.length2 += len(postcontext)
            if len(patch.diffs) != 0 and patch.diffs[-1][0] == self.DIFF_EQUAL:
              patch.diffs[-1] = (self.DIFF_EQUAL, patch.diffs[-1][1] +
                                 postcontext)
            else:
              patch.diffs.append((self.DIFF_EQUAL, postcontext))

          if not empty:
            x += 1
            patches.insert(x, patch)
|
1709 |
|
1710 def patch_toText(self, patches): |
|
1711 """Take a list of patches and return a textual representation. |
|
1712 |
|
1713 Args: |
|
1714 patches: Array of patch objects. |
|
1715 |
|
1716 Returns: |
|
1717 Text representation of patches. |
|
1718 """ |
|
1719 text = [] |
|
1720 for patch in patches: |
|
1721 text.append(str(patch)) |
|
1722 return "".join(text) |
|
1723 |
|
1724 def patch_fromText(self, textline): |
|
1725 """Parse a textual representation of patches and return a list of patch |
|
1726 objects. |
|
1727 |
|
1728 Args: |
|
1729 textline: Text representation of patches. |
|
1730 |
|
1731 Returns: |
|
1732 Array of patch objects. |
|
1733 |
|
1734 Raises: |
|
1735 ValueError: If invalid input. |
|
1736 """ |
|
1737 if type(textline) == unicode: |
|
1738 # Patches should be composed of a subset of ascii chars, Unicode not |
|
1739 # required. If this encode raises UnicodeEncodeError, patch is invalid. |
|
1740 textline = textline.encode("ascii") |
|
1741 patches = [] |
|
1742 if not textline: |
|
1743 return patches |
|
1744 text = textline.split('\n') |
|
1745 while len(text) != 0: |
|
1746 m = re.match("^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$", text[0]) |
|
1747 if not m: |
|
1748 raise ValueError, "Invalid patch string: " + text[0] |
|
1749 patch = patch_obj() |
|
1750 patches.append(patch) |
|
1751 patch.start1 = int(m.group(1)) |
|
1752 if m.group(2) == '': |
|
1753 patch.start1 -= 1 |
|
1754 patch.length1 = 1 |
|
1755 elif m.group(2) == '0': |
|
1756 patch.length1 = 0 |
|
1757 else: |
|
1758 patch.start1 -= 1 |
|
1759 patch.length1 = int(m.group(2)) |
|
1760 |
|
1761 patch.start2 = int(m.group(3)) |
|
1762 if m.group(4) == '': |
|
1763 patch.start2 -= 1 |
|
1764 patch.length2 = 1 |
|
1765 elif m.group(4) == '0': |
|
1766 patch.length2 = 0 |
|
1767 else: |
|
1768 patch.start2 -= 1 |
|
1769 patch.length2 = int(m.group(4)) |
|
1770 |
|
1771 del text[0] |
|
1772 |
|
1773 while len(text) != 0: |
|
1774 if text[0]: |
|
1775 sign = text[0][0] |
|
1776 else: |
|
1777 sign = '' |
|
1778 line = urllib.unquote(text[0][1:]) |
|
1779 line = line.decode("utf-8") |
|
1780 if sign == '+': |
|
1781 # Insertion. |
|
1782 patch.diffs.append((self.DIFF_INSERT, line)) |
|
1783 elif sign == '-': |
|
1784 # Deletion. |
|
1785 patch.diffs.append((self.DIFF_DELETE, line)) |
|
1786 elif sign == ' ': |
|
1787 # Minor equality. |
|
1788 patch.diffs.append((self.DIFF_EQUAL, line)) |
|
1789 elif sign == '@': |
|
1790 # Start of next patch. |
|
1791 break |
|
1792 elif sign == '': |
|
1793 # Blank line? Whatever. |
|
1794 pass |
|
1795 else: |
|
1796 # WTF? |
|
1797 raise ValueError, "Invalid patch mode: '%s'\n%s" % (sign, line) |
|
1798 del text[0] |
|
1799 return patches |
|
1800 |
|
1801 |
|
class patch_obj:
  """Class representing one patch operation.
  """

  def __init__(self):
    """Initializes with an empty list of diffs.
    """
    # List of (op, text) diff tuples making up this patch.
    self.diffs = []
    # 0-based start offsets into text1 and text2.
    self.start1 = None
    self.start2 = None
    # Character lengths of the affected spans in text1 and text2.
    self.length1 = 0
    self.length2 = 0

  def __str__(self):
    """Emulate GNU diff's format.
    Header: @@ -382,8 +481,9 @@
    Indices are printed as 1-based, not 0-based.

    Returns:
      The GNU diff string.
    """
    def coords(start, length):
      # Render one "start,length" header coordinate, with GNU diff's two
      # special cases: zero-length keeps the raw start, length one omits
      # the length entirely.
      if length == 0:
        return str(start) + ",0"
      if length == 1:
        return str(start + 1)
      return str(start + 1) + "," + str(length)

    text = ["@@ -", coords(self.start1, self.length1),
            " +", coords(self.start2, self.length2), " @@\n"]
    # Escape the body of the patch with %xx notation.
    for (op, data) in self.diffs:
      if op == diff_match_patch.DIFF_INSERT:
        sign = "+"
      elif op == diff_match_patch.DIFF_DELETE:
        sign = "-"
      elif op == diff_match_patch.DIFF_EQUAL:
        sign = " "
      else:
        # Unknown op: emit no sign, matching the original's silent skip.
        sign = ""
      text.append(sign)
      # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
      data = data.encode("utf-8")
      text.append(urllib.quote(data, "!~*'();/?:@&=+$,# ") + "\n")
    return "".join(text)