view git_handler.py @ 99:541571f2e429

add progress indication during export
author Sverre Rabbelier <sverre@rabbelier.nl>
date Sun, 10 May 2009 07:29:15 -0700
parents f0628f5270b6
children e17d9eea44ab
line wrap: on
line source

import os, errno, sys, time, datetime, pickle, copy, math
import toposort
import dulwich
from dulwich.repo import Repo
from dulwich.client import SimpleFetchGraphWalker
from hgext import bookmarks
from mercurial.i18n import _
from mercurial.node import bin, hex, nullid
from mercurial import hg, util, context, error
from dulwich.objects import (
    Blob,
    Commit,
    ShaFile,
    Tag,
    Tree,
    hex_to_sha
    )

import math

def seconds_to_offset(time):
    hours = (float(time) / 60 / 60)
    hour_diff = math.fmod(time, 60)
    minutes = int(hour_diff)
    hours = int(math.floor(hours))
    if hours > 12:
        sign = '+'
        hours = 12 - (hours - 12)
    elif hours > 0:
        sign = '-'
    else:
        sign = ''
    return sign + str(hours).rjust(2, '0') + str(minutes).rjust(2, '0')

def offset_to_seconds(offset):
    if len(offset) == 5:
        sign = offset[0:1]
        hours = int(offset[1:3])
        minutes = int(offset[3:5])
        if sign == '+':
            hours = 12 + (12 - hours)
        return (hours * 60 * 60) + (minutes) * 60
    else:
        return 0

class GitHandler(object):

    def __init__(self, dest_repo, ui):
        self.repo = dest_repo
        self.ui = ui
        self.init_if_missing()
        self.load_git()
        self.load_map()
        self.load_config()

    # make the git data directory
    def init_if_missing(self):
        git_hg_path = os.path.join(self.repo.path, 'git')
        if not os.path.exists(git_hg_path):
            os.mkdir(git_hg_path)
            Repo.init_bare(git_hg_path)

    def load_git(self):
        git_dir = os.path.join(self.repo.path, 'git')
        self.git = Repo(git_dir)

    ## FILE LOAD AND SAVE METHODS

    def map_set(self, gitsha, hgsha):
        self._map_git[gitsha] = hgsha
        self._map_hg[hgsha] = gitsha

    def map_hg_get(self, gitsha):
	return self._map_git.get(gitsha)

    def map_git_get(self, hgsha):
	return self._map_hg.get(hgsha)

    def load_map(self):
        self._map_git = {}
        self._map_hg = {}
        if os.path.exists(self.repo.join('git-mapfile')):
            for line in self.repo.opener('git-mapfile'):
                gitsha, hgsha = line.strip().split(' ', 1)
                self._map_git[gitsha] = hgsha
                self._map_hg[hgsha] = gitsha

    def save_map(self):
        file = self.repo.opener('git-mapfile', 'w+')
        for gitsha, hgsha in self._map_git.iteritems():
            file.write("%s %s\n" % (gitsha, hgsha))
        file.close()

    def load_config(self):
        self._config = {}
        if os.path.exists(self.repo.join('git-config')):
            for line in self.repo.opener('git-config'):
                key, value = line.strip().split(' ', 1)
                self._config[key] = value

    def save_config(self):
        file = self.repo.opener('git-config', 'w+')
        for key, value in self._config.iteritems():
            file.write("%s %s\n" % (key, value))
        file.close()


    ## END FILE LOAD AND SAVE METHODS

    def fetch(self, remote_name):
        self.ui.status(_("fetching from : %s\n") % remote_name)
        self.export_git_objects()
        refs = self.fetch_pack(remote_name)
        if refs:
            self.import_git_objects(remote_name)
        self.save_map()

    def export(self):
        self.export_git_objects()
        self.update_references()
        self.save_map()

    def push(self, remote_name):
        self.ui.status(_("pushing to : %s\n") % remote_name)
        self.export()
        self.upload_pack(remote_name)

    def remote_add(self, remote_name, git_url):
        self._config['remote.' + remote_name + '.url'] = git_url
        self.save_config()

    def remote_remove(self, remote_name):
        key = 'remote.' + remote_name + '.url'
        if key in self._config:
            del self._config[key]
        self.save_config()

    def remote_show(self, remote_name):
        key = 'remote.' + remote_name + '.url'
        if key in self._config:
            name = self._config[key]
            self.ui.status(_("URL for %s : %s\n") % (remote_name, name, ))
        else:
            self.ui.status(_("No remote named : %s\n") % remote_name)
        return

    def remote_list(self):
        for key, value in self._config.iteritems():
            if key[0:6] == 'remote':
                self.ui.status('%s\t%s\n' % (key, value, ))

    def remote_name_to_url(self, remote_name):
        return self._config['remote.' + remote_name + '.url']

    def update_references(self):
        # TODO : if bookmarks exist, add them as git branches
        c = self.map_git_get(hex(self.repo.changelog.tip()))
        self.git.set_ref('refs/heads/master', c)

    def export_git_objects(self):
        self.ui.status(_("exporting git objects\n"))
        total = len(self.repo.changelog)
        magnitude = int(math.log(total, 10)) + 1
        for i, rev in enumerate(self.repo.changelog):
            if i%100 == 0:
                self.ui.status(_("at: %*d/%d\n") % (magnitude, i, total))
            self.export_hg_commit(rev)
            self.save_map()

    # convert this commit into git objects
    # go through the manifest, convert all blobs/trees we don't have
    # write the commit object (with metadata info)
    def export_hg_commit(self, rev):
        # return if we've already processed this
        node = self.repo.changelog.lookup(rev)
        phgsha = hex(node)
        pgit_sha = self.map_git_get(phgsha)
        if pgit_sha:
            return pgit_sha

        self.ui.status(_("converting revision %s\n") % str(rev))

        # make sure parents are converted first
        parents = self.repo.parents(rev)
        for parent in parents:
            p_rev = parent.rev()
            hgsha = hex(parent.node())
            git_sha = self.map_git_get(hgsha)
            if not p_rev == -1:
                if not git_sha:
                    self.export_hg_commit(p_rev)

        ctx = self.repo.changectx(rev)
        tree_sha, renames = self.write_git_tree(ctx)
        
        commit = {}
        commit['tree'] = tree_sha
        (time, timezone) = ctx.date()

        # hg authors might not have emails
        author = ctx.user()
        if not '>' in author: # TODO : this kills losslessness - die (submodules)?
            author = author + ' <none@none>'
        commit['author'] = author + ' ' + str(int(time)) + ' ' + seconds_to_offset(timezone)
        message = ctx.description()
        commit['message'] = ctx.description() + "\n"

        extra = ctx.extra()
        if 'committer' in extra:
            commit['committer'] = extra['committer']

        # HG EXTRA INFORMATION
        add_extras = False
        extra_message = ''
        if not ctx.branch() == 'default':
            add_extras = True
            extra_message += "branch : " + ctx.branch() + "\n"

        if renames:
            add_extras = True
            for oldfile, newfile in renames:
                extra_message += "rename : " + oldfile + " => " + newfile + "\n"
            
        if add_extras:
            commit['message'] += "\n--HG--\n" + extra_message

        commit['parents'] = []
        for parent in parents:
            hgsha = hex(parent.node())
            git_sha = self.map_git_get(hgsha)
            if git_sha:
                commit['parents'].append(git_sha)

        commit_sha = self.git.write_commit_hash(commit) # writing new blobs to git
        self.map_set(commit_sha, phgsha)
        return commit_sha

    def write_git_tree(self, ctx):
        trees = {}
        man = ctx.manifest()
        renames = []
        for filenm in man.keys():
            # write blob if not in our git database
            fctx = ctx.filectx(filenm)
            rename = fctx.renamed()
            if rename:
                filerename, sha = rename
                renames.append((filerename, filenm))
            is_exec = 'x' in fctx.flags()
            is_link = 'l' in fctx.flags()
            file_id = hex(fctx.filenode())
            blob_sha = self.map_git_get(file_id)
            if not blob_sha:
                blob_sha = self.git.write_blob(fctx.data()) # writing new blobs to git
                self.map_set(blob_sha, file_id)

            parts = filenm.split('/')
            if len(parts) > 1:
                # get filename and path for leading subdir
                filepath = parts[-1:][0]
                dirpath = "/".join([v for v in parts[0:-1]]) + '/'

                # get subdir name and path for parent dir
                parpath = '/'
                nparpath = '/'
                for part in parts[0:-1]:
                    if nparpath == '/':
                        nparpath = part + '/'
                    else:
                        nparpath += part + '/'

                    treeentry = ['tree', part + '/', nparpath]

                    if parpath not in trees:
                        trees[parpath] = []
                    if treeentry not in trees[parpath]:
                        trees[parpath].append( treeentry )

                    parpath = nparpath

                # set file entry
                fileentry = ['blob', filepath, blob_sha, is_exec, is_link]
                if dirpath not in trees:
                    trees[dirpath] = []
                trees[dirpath].append(fileentry)

            else:
                fileentry = ['blob', parts[0], blob_sha, is_exec, is_link]
                if '/' not in trees:
                    trees['/'] = []
                trees['/'].append(fileentry)

        # sort by tree depth, so we write the deepest trees first
        dirs = trees.keys()
        dirs.sort(lambda a, b: len(b.split('/'))-len(a.split('/')))
        dirs.remove('/')
        dirs.append('/')
        
        # write all the trees
        tree_sha = None
        tree_shas = {}
        for dirnm in dirs:
            tree_data = []
            for entry in trees[dirnm]:
                # replace tree path with tree SHA
                if entry[0] == 'tree':
                    sha = tree_shas[entry[2]]
                    entry[2] = sha
                tree_data.append(entry)
            tree_sha = self.git.write_tree_array(tree_data) # writing new trees to git
            tree_shas[dirnm] = tree_sha
        
        return (tree_sha, renames) # should be the last root tree sha

    def remote_head(self, remote_name):
        for head, sha in self.git.remote_refs(remote_name).iteritems():
            if head == 'HEAD':
                return self.map_hg_get(sha)
        return None

    def upload_pack(self, remote_name):
        git_url = self.remote_name_to_url(remote_name)
        client, path = self.get_transport_and_path(git_url)
        changed = self.get_changed_refs
        genpack = self.generate_pack_contents
        try:
            self.ui.status(_("creating and sending data\n"))
            changed_refs = client.send_pack(path, changed, genpack)
            if changed_refs:
                new_refs = {}
                for ref, sha in changed_refs.iteritems():
                    self.ui.status("    "+ remote_name + "::" + ref + " => GIT:" + sha[0:8] + "\n")
                    new_refs[ref] = sha
                self.git.set_remote_refs(new_refs, remote_name)
                self.update_hg_bookmarks(remote_name)
        except:
            # TODO : remove try/except or do something useful here
            raise

    # TODO : for now, we'll just push all heads that match remote heads
    #        * we should have specified push, tracking branches and --all
    # takes a dict of refs:shas from the server and returns what should be
    # pushed up
    def get_changed_refs(self, refs):
        keys = refs.keys()

        changed = {}
        if not keys:
            return None

        # TODO : this is a huge hack
        if keys[0] == 'capabilities^{}': # nothing on the server yet - first push
            changed['refs/heads/master'] = self.git.ref('master')

        for ref_name in keys:
            parts = ref_name.split('/')
            if parts[0] == 'refs': # strip off 'refs/heads'
                if parts[1] == 'heads':
                    head = "/".join([v for v in parts[2:]])
                    local_ref = self.git.ref(ref_name)
                    if local_ref:
                        if not local_ref == refs[ref_name]:
                            changed[ref_name] = local_ref
        return changed

    # takes a list of shas the server wants and shas the server has
    # and generates a list of commit shas we need to push up
    def generate_pack_contents(self, want, have):
        graph_walker = SimpleFetchGraphWalker(want, self.git.get_parents)
        next = graph_walker.next()
        shas = set()
        while next:
            if next in have:
                graph_walker.ack(next)
            else:
                shas.add(next)
            next = graph_walker.next()
        
        seen = []
        
        # so now i have the shas, need to turn them into a list of
        # tuples (sha, path) for ALL the objects i'm sending
        # TODO : don't send blobs or trees they already have
        def get_objects(tree, path):
            changes = list()
            changes.append((tree, path))
            for (mode, name, sha) in tree.entries():
                if mode == 57344: # TODO : properly handle submodules and document what 57344 means
                    continue
                if sha in seen:
                    continue
                    
                obj = self.git.get_object(sha)
                seen.append(sha)
                if isinstance (obj, Blob):
                    changes.append((obj, path + name))
                elif isinstance(obj, Tree):
                    changes.extend(get_objects(obj, path + name + '/'))
            return changes

        objects = []
        for commit_sha in shas:
            commit = self.git.commit(commit_sha)
            objects.append((commit, 'commit'))
            tree = self.git.get_object(commit.tree)
            objects.extend( get_objects(tree, '/') )

        return objects

    def fetch_pack(self, remote_name):
        git_url = self.remote_name_to_url(remote_name)
        client, path = self.get_transport_and_path(git_url)
        graphwalker = SimpleFetchGraphWalker(self.git.heads().values(), self.git.get_parents)
        f, commit = self.git.object_store.add_pack()
        try:
            determine_wants = self.git.object_store.determine_wants_all
            refs = client.fetch_pack(path, determine_wants, graphwalker, f.write, sys.stdout.write)
            f.close()
            commit()
            if refs:
                self.git.set_remote_refs(refs, remote_name)
            else:
                self.ui.status(_("nothing new on the server\n"))
            return refs
        except:
            f.close()
            raise

    def import_git_objects(self, remote_name):
        self.ui.status(_("importing Git objects into Hg\n"))
        # import heads as remote references
        todo = []
        done = set()
        convert_list = {}

        # get a list of all the head shas
        for head, sha in self.git.remote_refs(remote_name).iteritems():
            todo.append(sha)

        # traverse the heads getting a list of all the unique commits
        while todo:
            sha = todo.pop()
            assert isinstance(sha, str)
            if sha in done:
                continue
            done.add(sha)
            try:
                commit = self.git.commit(sha)
                convert_list[sha] = commit
                todo.extend([p for p in commit.parents if p not in done])
            except:
                self.ui.warn(_("Cannot import tags yet\n")) # TODO

        # sort the commits
        commits = toposort.TopoSort(convert_list).items()

        # import each of the commits, oldest first
        for csha in commits:
            if not self.map_hg_get(csha): # it's already here
                commit = convert_list[csha]
                self.import_git_commit(commit)

        self.update_hg_bookmarks(remote_name)

    def update_hg_bookmarks(self, remote_name):
        try:
            bms = bookmarks.parse(self.repo)
            for head, sha in self.git.remote_refs(remote_name).iteritems():
                hgsha = hex_to_sha(self.map_hg_get(sha))
                if not head == 'HEAD':
                    bms[remote_name + '/' + head] = hgsha
            bookmarks.write(self.repo, bms)
        except AttributeError:
            self.ui.warn(_('creating bookmarks failed, do you have'
                         ' bookmarks enabled?\n'))

    def convert_git_int_mode(self, mode):
	# TODO : make these into constants
        convert = {
         33188: '',
         40960: 'l',
         33261: 'e'}
        if mode in convert:
            return convert[mode]
        return ''

    def extract_hg_metadata(self, message):
        split = message.split("\n\n--HG--\n", 1)
        renames = {}
        branch = False
        if len(split) == 2:
            message, meta = split
            lines = meta.split("\n")
            for line in lines:
                if line == '':
                    continue 
                
                command, data = line.split(" : ", 1)
                if command == 'rename':
                    before, after = data.split(" => ", 1)
                    renames[after] = before
                if command == 'branch':
                    branch = data
        return (message, renames, branch)
        
    def import_git_commit(self, commit):
        self.ui.debug(_("importing: %s\n") % commit.id)
        # TODO : find and use hg named branches
        # TODO : add extra Git data (committer info) as extras to changeset

        # TODO : (?) have to handle merge contexts at some point (two parent files, etc)
        # TODO : Do something less coarse-grained than try/except on the
        #        get_file call for removed files
        
        (strip_message, hg_renames, hg_branch) = self.extract_hg_metadata(commit.message)
        
        def getfilectx(repo, memctx, f):
            try:
                (mode, sha, data) = self.git.get_file(commit, f)
                e = self.convert_git_int_mode(mode)
            except TypeError:
                raise IOError()
            if f in hg_renames:
                copied_path = hg_renames[f]
            else:
                copied_path = None
            return context.memfilectx(f, data, 'l' in e, 'x' in e, copied_path)

        p1 = "0" * 40
        p2 = "0" * 40
        if len(commit.parents) > 0:
            sha = commit.parents[0]
            p1 = self.map_hg_get(sha)
        if len(commit.parents) > 1:
            sha = commit.parents[1]
            p2 = self.map_hg_get(sha)
        if len(commit.parents) > 2:
            # TODO : map extra parents to the extras file
            pass

        # get a list of the changed, added, removed files
        files = self.git.get_files_changed(commit)

        # if this is a merge commit, don't list renamed files
        # i'm really confused here - this is the only way my use case will
        # work, but it seems really hacky - do I need to just remove renames
        # from one of the parents? AARRGH!
        if not (p2 == "0"*40):
            for removefile in hg_renames.values():
                files.remove(removefile)

        extra = {}

        # if named branch, add to extra
        if hg_branch:
            extra['branch'] = hg_branch

        # if committer is different than author, add it to extra
        if not commit._author_raw == commit._committer_raw:
            extra['committer'] = commit._committer_raw

        if hg_branch:
            extra['branch'] = hg_branch
                
        text = strip_message
        date = datetime.datetime.fromtimestamp(commit.author_time).strftime("%Y-%m-%d %H:%M:%S")
        ctx = context.memctx(self.repo, (p1, p2), text, files, getfilectx,
                             commit.author, date, extra)
        a = self.repo.commitctx(ctx)

        # get changeset id
        p2 = hex(self.repo.changelog.tip())
        # save changeset to mapping file
        gitsha = commit.id
        self.map_set(gitsha, p2)

    def check_bookmarks(self):
        if self.ui.config('extensions', 'hgext.bookmarks') is not None:
            self.ui.warn("YOU NEED TO SETUP BOOKMARKS\n")

    def get_transport_and_path(self, uri):
        from dulwich.client import TCPGitClient, SSHGitClient, SubprocessGitClient
        for handler, transport in (("git://", TCPGitClient), ("git@", SSHGitClient), ("git+ssh://", SSHGitClient)):
            if uri.startswith(handler):
                if handler == 'git@':
                    host, path = uri[len(handler):].split(":", 1)
                    host = 'git@' + host
                else:
                    host, path = uri[len(handler):].split("/", 1)
                return transport(host), '/' + path
        # if its not git or git+ssh, try a local url..
        return SubprocessGitClient(), uri

    def clear(self):
        git_dir = self.repo.join('git')
        mapfile = self.repo.join('git-mapfile')
        if os.path.exists(git_dir):
            for root, dirs, files in os.walk(git_dir, topdown=False):
                for name in files:
                    os.remove(os.path.join(root, name))
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
            os.rmdir(git_dir)
        if os.path.exists(mapfile):
            os.remove(mapfile)