diff hggit/hg2git.py @ 596:d6b9c30a3e0f

Export Git objects from incremental Mercurial changes This replaces the brute force Mercurial to Git export with one that is incremental. It results in a decent performance win and paves the road for parallel export via using multiple incremental exporters.
author Gregory Szorc <gregory.szorc@gmail.com>
date Tue, 19 Mar 2013 22:44:01 -0700
parents
children 792955be68dd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hggit/hg2git.py	Tue Mar 19 22:44:01 2013 -0700
@@ -0,0 +1,274 @@
+# This file contains code dealing specifically with converting Mercurial
+# repositories to Git repositories. Code in this file is meant to be a generic
+# library and should be usable outside the context of hg-git or an hg command.
+
+import os
+import stat
+
+import dulwich.objects as dulobjs
+import mercurial.node
+
+import util
+
+
+class IncrementalChangesetExporter(object):
+    """Incrementally export Mercurial changesets to Git trees.
+
+    The purpose of this class is to facilitate Git tree export that is more
+    optimal than brute force.
+
+    A "dumb" implementations of Mercurial to Git export would iterate over
+    every file present in a Mercurial changeset and would convert each to
+    a Git blob and then conditionally add it to a Git repository if it didn't
+    yet exist. This is suboptimal because the overhead associated with
+    obtaining every file's raw content and converting it to a Git blob is
+    not trivial!
+
+    This class works around the suboptimality of brute force export by
+    leveraging the information stored in Mercurial - the knowledge of what
+    changed between changesets - to only export Git objects corresponding to
+    changes in Mercurial. In the context of converting Mercurial repositories
+    to Git repositories, we only export objects Git (possibly) hasn't seen yet.
+    This prevents a lot of redundant work and is thus faster.
+
+    Callers instantiate an instance of this class against a mercurial.localrepo
+    instance. They then associate it with a specific changesets by calling
+    update_changeset(). On each call to update_changeset(), the instance
+    computes the difference between the current and new changesets and emits
+    Git objects that haven't yet been encountered during the lifetime of the
+    class instance. In other words, it expresses Mercurial changeset deltas in
+    terms of Git objects. Callers then (usually) take this set of Git objects
+    and add them to the Git repository.
+
+    This class only emits Git blobs and trees, not commits.
+
+    The tree calculation part of this class is essentially a reimplementation
+    of dulwich.index.commit_tree. However, since our implementation reuses
+    Tree instances and only recalculates SHA-1 when things change, we are
+    more efficient.
+    """
+
+    def __init__(self, hg_repo):
+        """Create an instance against a mercurial.localrepo."""
+        self._hg = hg_repo
+
+        # Our current revision.
+        self._rev = mercurial.node.nullrev
+
+        # Path to dulwich.objects.Tree.
+        self._dirs = {}
+
+        # Mercurial file nodeid to Git blob SHA-1. Used to prevent redundant
+        # blob calculation.
+        self._blob_cache = {}
+
+    @property
+    def root_tree_sha(self):
+        """The SHA-1 of the root Git tree.
+
+        This is needed to construct a Git commit object.
+        """
+        return self._dirs[''].id
+
+    def update_changeset(self, ctx):
+        """Set the tree to track a new Mercurial changeset.
+
+        This is a generator of 2-tuples. The first item in each tuple is a
+        dulwich object, either a Blob or a Tree. The second item is the
+        corresponding Mercurial nodeid for the item, if any. Only blobs will
+        have nodeids. Trees do not correspond to a specific nodeid, so it does
+        not make sense to emit a nodeid for them.
+
+        When exporting trees from Mercurial, callers typically write the
+        returned dulwich object to the Git repo via the store's add_object().
+
+        Some emitted objects may already exist in the Git repository. This
+        class does not know about the Git repository, so it's up to the caller
+        to conditionally add the object, etc.
+
+        Emitted objects are those that have changed since the last call to
+        update_changeset. If this is the first call to update_chanageset, all
+        objects in the tree are emitted.
+        """
+        # Our general strategy is to accumulate dulwich.objects.Blob and
+        # dulwich.objects.Tree instances for the current Mercurial changeset.
+        # We do this incremental by iterating over the Mercurial-reported
+        # changeset delta. We rely on the behavior of Mercurial to lazy
+        # calculate a Tree's SHA-1 when we modify it. This is critical to
+        # performance.
+
+        # In theory we should be able to look at changectx.files(). This is
+        # *much* faster. However, it may not be accurate, especially with older
+        # repositories, which may not record things like deleted files
+        # explicitly in the manifest (which is where files() gets its data).
+        # The only reliable way to get the full set of changes is by looking at
+        # the full manifest. And, the easy way to compare two manifests is
+        # localrepo.status().
+        modified, added, removed = self._hg.status(self._rev, ctx.rev())[0:3]
+
+        # We first process file removals so we can prune dead trees.
+        for path in sorted(removed, key=len, reverse=True):
+            d = os.path.dirname(path)
+            tree = self._dirs.get(d, dulobjs.Tree())
+
+            del tree[os.path.basename(path)]
+
+            # If removing this file made the tree empty, we should delete this
+            # tree. This could result in parent trees losing their only child
+            # and so on.
+            if not len(tree):
+                self._remove_tree(d)
+                continue
+
+            self._dirs[d] = tree
+
+        # For every file that changed or was added, we need to calculate the
+        # corresponding Git blob and its tree entry. We emit the blob
+        # immediately and update trees to be aware of its presence.
+        for path in sorted(set(modified) | set(added), key=len, reverse=True):
+            # Handle special Mercurial paths.
+            if path == '.hgsubstate':
+                self._handle_subrepos(ctx)
+                continue
+
+            if path == '.hgsub':
+                continue
+
+            d = os.path.dirname(path)
+            tree = self._dirs.setdefault(d, dulobjs.Tree())
+
+            fctx = ctx[path]
+
+            entry, blob = IncrementalChangesetExporter.tree_entry(fctx,
+                self._blob_cache)
+            if blob is not None:
+                yield (blob, fctx.filenode())
+
+            tree.add(*entry)
+
+        # Now that all the trees represent the current changeset, recalculate
+        # the tree IDs and emit them. Note that we wait until now to calculate
+        # tree SHA-1s. This is an important difference between us and
+        # dulwich.index.commit_tree(), which builds new Tree instances for each
+        # series of blobs.
+        for obj in self._populate_tree_entries():
+            yield (obj, None)
+
+        self._rev = ctx.rev()
+
+    def _remove_tree(self, path):
+        """Remove a (presumably empty) tree from the current changeset.
+
+        A now-empty tree may be the only child of its parent. So, we traverse
+        up the chain to the root tree, deleting any empty trees along the way.
+        """
+        try:
+            del self._dirs[path]
+        except KeyError:
+            return
+
+        # Now we traverse up to the parent and delete any references.
+        if path == '':
+            return
+
+        basename = os.path.basename(path)
+        parent = os.path.dirname(path)
+        while True:
+            tree = self._dirs.get(parent, None)
+
+            # No parent entry. Nothing to remove or update.
+            if tree is None:
+                return
+
+            try:
+                del tree[basename]
+            except KeyError:
+                return
+
+            if len(tree):
+                return
+
+            # The parent tree is empty. Se, we can delete it.
+            del self._dirs[parent]
+
+            if parent == '':
+                return
+
+            basename = os.path.basename(parent)
+            parent = os.path.dirname(parent)
+
+    def _populate_tree_entries(self):
+        self._dirs.setdefault('', dulobjs.Tree())
+
+        # Fill in missing directories.
+        for path in self._dirs.keys():
+            parent = os.path.dirname(path)
+
+            while parent != '':
+                parent_tree = self._dirs.get(parent, None)
+
+                if parent_tree is not None:
+                    break
+
+                self._dirs[parent] = dulobjs.Tree()
+                parent = os.path.dirname(parent)
+
+        # TODO only emit trees that have been modified.
+        for d in sorted(self._dirs.keys(), key=len, reverse=True):
+            tree = self._dirs[d]
+            yield tree
+
+            if d == '':
+                continue
+
+            parent_tree = self._dirs[os.path.dirname(d)]
+
+            # Accessing the tree's ID is what triggers SHA-1 calculation and is
+            # the expensive part (at least if the tree has been modified since
+            # the last time we retrieved its ID).
+            parent_tree[os.path.basename(d)] = (stat.S_IFDIR, tree.id)
+
+    def _handle_subrepos(self, ctx):
+        substate = util.parse_hgsubstate(ctx['.hgsubstate'].data().splitlines())
+        sub = util.OrderedDict()
+
+        if '.hgsub' in ctx:
+            sub = util.parse_hgsub(ctx['.hgsub'].data().splitlines())
+
+        for path, sha in substate.iteritems():
+            # Ignore non-Git repositories keeping state in .hgsubstate.
+            if path in sub and not sub[path].startswith('[git]'):
+                continue
+
+            d = os.path.dirname(path)
+            tree = self._dirs.setdefault(d, dulobjs.Tree())
+            tree.add(os.path.basename(path), dulobjs.S_IFGITLINK, sha)
+
+    @staticmethod
+    def tree_entry(fctx, blob_cache):
+        """Compute a dulwich TreeEntry from a filectx.
+
+        A side effect is the TreeEntry is stored in the passed cache.
+
+        Returns a 2-tuple of (dulwich.objects.TreeEntry, dulwich.objects.Blob).
+        """
+        blob_id = blob_cache.get(fctx.filenode(), None)
+        blob = None
+
+        if blob_id is None:
+            blob = dulobjs.Blob.from_string(fctx.data())
+            blob_id = blob.id
+            blob_cache[fctx.filenode()] = blob_id
+
+        flags = fctx.flags()
+
+        if 'l' in flags:
+            mode = 0120000
+        elif 'x' in flags:
+            mode = 0100755
+        else:
+            mode = 0100644
+
+        return (dulobjs.TreeEntry(os.path.basename(fctx.path()), mode, blob_id),
+                blob)
+