import fnmatch import logging import os import re import subprocess import tempfile from difflib import SequenceMatcher try: import pygments from pygments.lexers import get_lexer_for_filename # from pygments.lexers import guess_lexer_for_filename from pygments.formatters import HtmlFormatter except ImportError: pass from django.utils.html import escape from django.utils.http import urlquote from django.utils.safestring import mark_safe from django.utils.translation import ugettext as _ from djblets.log import log_timed from djblets.siteconfig.models import SiteConfiguration from djblets.util.misc import cache_memoize from reviewboard.accounts.models import Profile from reviewboard.admin.checks import get_can_enable_syntax_highlighting from reviewboard.diffviewer.myersdiff import MyersDiffer from reviewboard.diffviewer.smdiff import SMDiffer from reviewboard.scmtools.core import PRE_CREATION, HEAD DEFAULT_DIFF_COMPAT_VERSION = 1 class UserVisibleError(Exception): pass class DiffCompatError(Exception): pass def Differ(a, b, ignore_space=False, compat_version=DEFAULT_DIFF_COMPAT_VERSION): """ Factory wrapper for returning a differ class based on the compat version and flags specified. """ if compat_version == 0: return SMDiffer(a, b) elif compat_version == 1: return MyersDiffer(a, b, ignore_space) else: raise DiffCompatError( "Invalid diff compatibility version (%s) passed to Differ" % (compat_version)) def convert_line_endings(data): # Files without a trailing newline come out of Perforce (and possibly # other systems) with a trailing \r. Diff will see the \r and # add a "\ No newline at end of file" marker at the end of the file's # contents, which patch understands and will happily apply this to # a file with a trailing \r. # # The problem is that we normalize \r's to \n's, which breaks patch. # Our solution to this is to just remove that last \r and not turn # it into a \n. # # See http://code.google.com/p/reviewboard/issues/detail?id=386 # and http://reviews.review-board.org/r/286/ if data == "": return "" if data[-1] == "\r": data = data[:-1] temp = data.replace('\r\r\n', '\n') temp = data.replace('\r\n', '\n') temp = temp.replace('\r', '\n') return temp def patch(diff, file, filename): """Apply a diff to a file. Delegates out to `patch` because noone except Larry Wall knows how to patch.""" log_timer = log_timed("Patching file %s" % filename) if diff.strip() == "": # Someone uploaded an unchanged file. Return the one we're patching. return file # Prepare the temporary directory if none is available tempdir = tempfile.mkdtemp(prefix='reviewboard.') (fd, oldfile) = tempfile.mkstemp(dir=tempdir) f = os.fdopen(fd, "w+b") f.write(convert_line_endings(file)) f.close() diff = convert_line_endings(diff) # XXX: catch exception if Popen fails? newfile = '%s-new' % oldfile p = subprocess.Popen(['patch', '-o', newfile, oldfile], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) p.stdin.write(diff) p.stdin.close() patch_output = p.stdout.read() failure = p.wait() if failure: f = open("%s.diff" % (os.path.join(tempdir, os.path.basename(filename))), "w") f.write(diff) f.close() log_timer.done() # FIXME: This doesn't provide any useful error report on why the patch # failed to apply, which makes it hard to debug. We might also want to # have it clean up if DEBUG=False raise Exception(_("The patch to '%s' didn't apply cleanly. The temporary " + "files have been left in '%s' for debugging purposes.\n" + "`patch` returned: %s") % (filename, tempdir, patch_output)) f = open(newfile, "r") data = f.read() f.close() os.unlink(oldfile) os.unlink(newfile) os.rmdir(tempdir) log_timer.done() return data def get_line_changed_regions(oldline, newline): if oldline is None or newline is None: return (None, None) # Use the SequenceMatcher directly. It seems to give us better results # for this. We should investigate steps to move to the new differ. differ = SequenceMatcher(None, oldline, newline) # This thresholds our results -- we don't want to show inter-line diffs if # most of the line has changed, unless those lines are very short. # FIXME: just a plain, linear threshold is pretty crummy here. Short # changes in a short line get lost. I haven't yet thought of a fancy # nonlinear test. if differ.ratio() < 0.6: return (None, None) oldchanges = [] newchanges = [] back = (0, 0) for tag, i1, i2, j1, j2 in differ.get_opcodes(): if tag == "equal": if (i2 - i1 < 3) or (j2 - j1 < 3): back = (j2 - j1, i2 - i1) continue oldstart, oldend = i1 - back[0], i2 newstart, newend = j1 - back[1], j2 if oldchanges != [] and oldstart <= oldchanges[-1][1] < oldend: oldchanges[-1] = (oldchanges[-1][0], oldend) elif not oldline[oldstart:oldend].isspace(): oldchanges.append((oldstart, oldend)) if newchanges != [] and newstart <= newchanges[-1][1] < newend: newchanges[-1] = (newchanges[-1][0], newend) elif not newline[newstart:newend].isspace(): newchanges.append((newstart, newend)) back = (0, 0) return (oldchanges, newchanges) def convert_to_utf8(s, enc): """ Returns the passed string as a unicode string. If conversion to UTF-8 fails, we try the user-specified encoding, which defaults to ISO 8859-15. This can be overridden by users inside the repository configuration, which gives users repository-level control over file encodings (file-level control is really, really hard). """ if isinstance(s, unicode): return s elif isinstance(s, basestring): try: u = unicode(s, 'utf-8') return u except UnicodeError: for e in enc.split(','): try: u = unicode(s, e) return u.encode('utf-8') except UnicodeError: pass raise Exception(_("Diff content couldn't be converted to UTF-8 " "using the following encodings: %s") % enc) else: raise TypeError("Value to convert is unexpected type %s", type(s)) def get_original_file(filediff): """ Get a file either from the cache or the SCM, applying the parent diff if it exists. SCM exceptions are passed back to the caller. """ data = "" if filediff.source_revision != PRE_CREATION: def fetch_file(file, revision): log_timer = log_timed("Fetching file '%s' r%s from %s" % (file, revision, repository)) data = tool.get_file(file, revision) data = convert_line_endings(data) log_timer.done() return data repository = filediff.diffset.repository tool = repository.get_scmtool() file = filediff.source_file revision = filediff.source_revision key = "%s:%s:%s" % (filediff.diffset.repository.path, urlquote(file), revision) # We wrap the result of get_file in a list and then return the first # element after getting the result from the cache. This prevents the # cache backend from converting to unicode, since we're no longer # passing in a string and the cache backend doesn't recursively look # through the list in order to convert the elements inside. # # Basically, this fixes the massive regressions introduced by the # Django unicode changes. data = cache_memoize(key, lambda: [fetch_file(file, revision)], large_data=True)[0] # If there's a parent diff set, apply it to the buffer. if filediff.parent_diff: data = patch(filediff.parent_diff, data, filediff.source_file) return data def get_patched_file(buffer, filediff): return patch(filediff.diff, buffer, filediff.dest_file) def get_chunks(diffset, filediff, interfilediff, force_interdiff, enable_syntax_highlighting): def diff_line(vlinenum, oldlinenum, newlinenum, oldline, newline, oldmarkup, newmarkup): # This function accesses the variable meta, defined in an outer context. if oldline and newline and oldline != newline: oldregion, newregion = get_line_changed_regions(oldline, newline) else: oldregion = newregion = [] return [vlinenum, oldlinenum or '', mark_safe(oldmarkup or ''), oldregion, newlinenum or '', mark_safe(newmarkup or ''), newregion, (oldlinenum, newlinenum) in meta['whitespace_lines']] def new_chunk(lines, numlines, tag, collapsable=False, meta={}): return { 'lines': lines, 'numlines': numlines, 'change': tag, 'collapsable': collapsable, 'meta': meta, } def add_ranged_chunks(lines, start, end, collapsable=False): chunks.append(new_chunk(lines[start:end], end - start, 'equal', collapsable)) def apply_pygments(data, filename): # XXX Guessing is preferable but really slow, especially on XML # files. #if filename.endswith(".xml"): lexer = get_lexer_for_filename(filename, stripnl=False) #else: # lexer = guess_lexer_for_filename(filename, data, stripnl=False) try: # This is only available in 0.7 and higher lexer.add_filter('codetagify') except AttributeError: pass return pygments.highlight(data, lexer, HtmlFormatter()).splitlines() # There are three ways this function is called: # # 1) filediff, no interfilediff # - Returns chunks for a single filediff. This is the usual way # people look at diffs in the diff viewer. # # In this mode, we get the original file based on the filediff # and then patch it to get the resulting file. # # This is also used for interdiffs where the source revision # has no equivalent modified file but the interdiff revision # does. It's no different than a standard diff. # # 2) filediff, interfilediff # - Returns chunks showing the changes between a source filediff # and the interdiff. # # This is the typical mode used when showing the changes # between two diffs. It requires that the file is included in # both revisions of a diffset. # # 3) filediff, no interfilediff, force_interdiff # - Returns chunks showing the changes between a source # diff and an unmodified version of the diff. # # This is used when the source revision in the diffset contains # modifications to a file which have then been reverted in the # interdiff revision. We don't actually have an interfilediff # in this case, so we have to indicate that we are indeed in # interdiff mode so that we can special-case this and not # grab a patched file for the interdiff version. assert filediff file = filediff.source_file revision = filediff.source_revision old = get_original_file(filediff) new = get_patched_file(old, filediff) if interfilediff: old = new interdiff_orig = get_original_file(interfilediff) new = get_patched_file(interdiff_orig, interfilediff) elif force_interdiff: # Basically, revert the change. temp = old old = new new = temp encoding = diffset.repository.encoding or 'iso-8859-15' old = convert_to_utf8(old, encoding) new = convert_to_utf8(new, encoding) # Normalize the input so that if there isn't a trailing newline, we add # it. if old and old[-1] != '\n': old += '\n' if new and new[-1] != '\n': new += '\n' a = re.split(r"\r?\n", old or '') b = re.split(r"\r?\n", new or '') # Remove the trailing newline, now that we've split this. This will # prevent a duplicate line number at the end of the diff. del(a[-1]) del(b[-1]) a_num_lines = len(a) b_num_lines = len(b) markup_a = markup_b = None siteconfig = SiteConfiguration.objects.get_current() threshold = siteconfig.get('diffviewer_syntax_highlighting_threshold') if threshold and (a_num_lines > threshold or b_num_lines > threshold): enable_syntax_highlighting = False if enable_syntax_highlighting: try: # TODO: Try to figure out the right lexer for these files # once instead of twice. markup_a = apply_pygments(old or '', filediff.source_file) markup_b = apply_pygments(new or '', filediff.dest_file) except ValueError: pass if not markup_a: markup_a = re.split(r"\r?\n", escape(old)) if not markup_b: markup_b = re.split(r"\r?\n", escape(new)) chunks = [] linenum = 1 ignore_space = True for pattern in siteconfig.get("diffviewer_include_space_patterns"): if fnmatch.fnmatch(file, pattern): ignore_space = False break differ = Differ(a, b, ignore_space=ignore_space, compat_version=diffset.diffcompat) # TODO: Make this back into a preference if people really want it. context_num_lines = siteconfig.get("diffviewer_context_num_lines") collapse_threshold = 2 * context_num_lines + 3 if interfilediff: logging.debug("Generating diff chunks for interdiff ids %s-%s", filediff.id, interfilediff.id) else: logging.debug("Generating diff chunks for filediff id %s", filediff.id) for tag, i1, i2, j1, j2, meta in opcodes_with_metadata(differ): oldlines = markup_a[i1:i2] newlines = markup_b[j1:j2] numlines = max(len(oldlines), len(newlines)) lines = map(diff_line, xrange(linenum, linenum + numlines), xrange(i1 + 1, i2 + 1), xrange(j1 + 1, j2 + 1), a[i1:i2], b[j1:j2], oldlines, newlines) linenum += numlines if tag == 'equal' and numlines > collapse_threshold: last_range_start = numlines - context_num_lines if len(chunks) == 0: add_ranged_chunks(lines, 0, last_range_start, True) add_ranged_chunks(lines, last_range_start, numlines) else: add_ranged_chunks(lines, 0, context_num_lines) if i2 == a_num_lines and j2 == b_num_lines: add_ranged_chunks(lines, context_num_lines, numlines, True) else: add_ranged_chunks(lines, context_num_lines, last_range_start, True) add_ranged_chunks(lines, last_range_start, numlines) else: chunks.append(new_chunk(lines, numlines, tag, meta=meta)) if interfilediff: logging.debug("Done generating diff chunks for interdiff ids %s-%s", filediff.id, interfilediff.id) else: logging.debug("Done generating diff chunks for filediff id %s", filediff.id) return chunks def opcodes_with_metadata(differ): groups = [] ws_re = re.compile(r"\s") for tag, i1, i2, j1, j2 in differ.get_opcodes(): meta = { "whitespace_chunk": False, # True if this chunk is only whitespace. "whitespace_lines": [], # List of tuples (x,y), with whitespace changes. } if tag == 'replace': # replace groups are good for whitespace only changes. assert (i2 - i1) == (j2 - j1) for i, j in zip(xrange(i1, i2), xrange(j1, j2)): if ws_re.sub("", differ.a[i]) == ws_re.sub("", differ.b[j]): meta["whitespace_lines"].append((i + 1, j + 1)) if len(meta["whitespace_lines"]) == (i2 - i1): meta["whitespace_chunk"] = True groups.append((tag, i1, i2, j1, j2, meta)) for group in groups: yield group def get_revision_str(revision): if revision == HEAD: return "HEAD" elif revision == PRE_CREATION: return "" else: return "Revision %s" % revision def get_diff_files(diffset, filediff=None, interdiffset=None, enable_syntax_highlighting=True, load_chunks=True): if filediff: filediffs = [filediff] if interdiffset: log_timer = log_timed("Generating diff file info for " "interdiffset ids %s-%s, filediff %s" % (diffset.id, interdiffset.id, filediff.id)) else: log_timer = log_timed("Generating diff file info for " "diffset id %s, filediff %s" % (diffset.id, filediff.id)) else: filediffs = diffset.files.all() if interdiffset: log_timer = log_timed("Generating diff file info for " "interdiffset ids %s-%s" % (diffset.id, interdiffset.id)) else: log_timer = log_timed("Generating diff file info for " "diffset id %s" % diffset.id) # A map used to quickly look up the equivalent interfilediff given a # source file. interdiff_map = {} if interdiffset: for interfilediff in interdiffset.files.all(): if not filediff or \ filediff.source_file == interfilediff.source_file: interdiff_map[interfilediff.source_file] = interfilediff key_prefix = "diff-sidebyside-" if enable_syntax_highlighting: key_prefix += "hl-" # In order to support interdiffs properly, we need to display diffs # on every file in the union of both diffsets. Iterating over one diffset # or the other doesn't suffice. # # We build a list of parts containing the source filediff, the interdiff # filediff (if specified), and whether to force showing an interdiff # (in the case where a file existed in the source filediff but was # reverted in the interdiff). filediff_parts = [] for filediff in filediffs: interfilediff = None if filediff.source_file in interdiff_map: interfilediff = interdiff_map[filediff.source_file] del(interdiff_map[filediff.source_file]) filediff_parts.append((filediff, interfilediff, interdiffset != None)) if interdiffset: # We've removed everything in the map that we've already found. # What's left are interdiff files that are new. They have no file # to diff against. # # The end result is going to be a view that's the same as when you're # viewing a standard diff. As such, we can pretend the interdiff is # the source filediff and not specify an interdiff. Keeps things # simple, code-wise, since we really have no need to special-case # this. for interdiff in interdiff_map.values(): filediff_parts.append((interdiff, None, False)) files = [] index = 0 for parts in filediff_parts: filediff, interfilediff, force_interdiff = parts filediff_revision_str = get_revision_str(filediff.source_revision) newfile = (filediff.source_revision == PRE_CREATION) if interdiffset: # First, find out if we want to even process this one. # We only process if there's a difference in files. if (filediff and interfilediff and filediff.diff == interfilediff.diff): continue source_revision = "Diff Revision %s" % diffset.revision if not interfilediff and force_interdiff: dest_revision = "Diff Revision %s - File Reverted" % \ interdiffset.revision else: dest_revision = "Diff Revision %s" % interdiffset.revision else: source_revision = get_revision_str(filediff.source_revision) if newfile: dest_revision = _("New File") else: dest_revision = _("New Change") i = filediff.source_file.rfind('/') if i != -1: basepath = filediff.source_file[:i] basename = filediff.source_file[i + 1:] else: basepath = "" basename = filediff.source_file file = { 'depot_filename': filediff.source_file, 'basename': basename, 'basepath': basepath, 'revision': source_revision, 'dest_revision': dest_revision, 'filediff': filediff, 'interfilediff': interfilediff, 'force_interdiff': force_interdiff, 'binary': filediff.binary, 'newfile': newfile, 'index': len(files), } if load_chunks: chunks = [] if not filediff.binary: key = key_prefix if not force_interdiff: key += str(filediff.id) elif interfilediff: key += "interdiff-%s-%s" % (filediff.id, interfilediff.id) else: key += "interdiff-%s-none" % filediff.id chunks = cache_memoize( key, lambda: get_chunks(filediff.diffset, filediff, interfilediff, force_interdiff, enable_syntax_highlighting), large_data=True) file['chunks'] = chunks file['changed_chunks'] = [] file['whitespace_only'] = True for j, chunk in enumerate(file['chunks']): chunk['index'] = j if chunk['change'] != 'equal': file['changed_chunks'].append(chunk) meta = chunk.get('meta', {}) if not meta.get('whitespace_chunk', False): file['whitespace_only'] = False file['num_changes'] = len(file['changed_chunks']) files.append(file) def cmp_file(x, y): # Sort based on basepath in asc order if x["basepath"] != y["basepath"]: return cmp(x["basepath"], y["basepath"]) # Sort based on filename in asc order, then basod on extension in desc # order, to make *.h be ahead of *.c/cpp x_file, x_ext = os.path.splitext(x["basename"]) y_file, y_ext = os.path.splitext(y["basename"]) if x_file != y_file: return cmp(x_file, y_file) else: return cmp(y_ext, x_ext) files.sort(cmp_file) log_timer.done() return files def get_file_chunks_in_range(context, filediff, interfilediff, first_line, num_lines): """ A generator that yields chunks within a range of lines in the specified filediff/interfilediff. This is primarily intended for use with templates. It takes a RequestContext for looking up the user and for caching file lists, in order to improve performance and reduce lookup times for files that have already been fetched. Each returned chunk is a dictionary with the following fields: ============= ======================================================== Variable Description ============= ======================================================== ``change`` The change type ("equal", "replace", "insert", "delete") ``numlines`` The number of lines in the chunk. ``lines`` The list of lines in the chunk. ``meta`` A dictionary containing metadata on the chunk ============= ======================================================== Each line in the list of lines is an array with the following data: ======== ============================================================= Index Description ======== ============================================================= 0 Virtual line number (union of the original and patched files) 1 Real line number in the original file 2 HTML markup of the original file 3 Changed regions of the original line (for "replace" chunks) 4 Real line number in the patched file 5 HTML markup of the patched file 6 Changed regions of the patched line (for "replace" chunks) 7 True if line consists of only whitespace changes ======== ============================================================= """ interdiffset = None key = "_diff_files_%s_%s" % (filediff.diffset.id, filediff.id) if interfilediff: key += "_%s" % (interfilediff.id) interdiffset = interfilediff.diffset if key in context: files = context[key] else: assert 'user' in context files = get_diff_files(filediff.diffset, filediff, interdiffset, get_enable_highlighting(context['user'])) context[key] = files if not files: raise StopIteration assert len(files) == 1 for chunk in files[0]['chunks']: lines = chunk['lines'] if lines[-1][0] >= first_line >= lines[0][0]: start_index = first_line - lines[0][0] if first_line + num_lines <= lines[-1][0]: last_index = start_index + num_lines else: last_index = len(lines) new_chunk = { 'lines': chunk['lines'][start_index:last_index], 'numlines': last_index - start_index, 'change': chunk['change'], 'meta': chunk['meta'], } yield new_chunk first_line += new_chunk['numlines'] num_lines -= new_chunk['numlines'] assert num_lines >= 0 if num_lines == 0: break def get_enable_highlighting(user): if user.is_authenticated(): profile, profile_is_new = Profile.objects.get_or_create(user=user) user_syntax_highlighting = profile.syntax_highlighting else: user_syntax_highlighting = True siteconfig = SiteConfiguration.objects.get_current() return (siteconfig.get('diffviewer_syntax_highlighting') and user_syntax_highlighting and get_can_enable_syntax_highlighting())