changeset 171:88e413d853ee

fixed serious speed issue with rename detection
author Scott Chacon <schacon@gmail.com>
date Wed, 03 Jun 2009 09:22:17 -0700
parents 92e708d6e3a1
children ac92cdc45ceb
files TODO.txt __init__.py git_handler.py lsprofcalltree.py
diffstat 4 files changed, 93 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/TODO.txt	Tue Jun 02 21:27:19 2009 -0700
+++ b/TODO.txt	Wed Jun 03 09:22:17 2009 -0700
@@ -30,6 +30,7 @@
 
 SPEED/EFFICIENCY
 ================
+* cache rename detection on push (seems to be very expensive)
 * switch object mapping to hg->git since the many to one is that direction
 * don't send blobs/trees already on server (thin pack)
 * packfile creation benchmarking (seems to take a while sometimes)
--- a/__init__.py	Tue Jun 02 21:27:19 2009 -0700
+++ b/__init__.py	Wed Jun 03 09:22:17 2009 -0700
@@ -49,14 +49,19 @@
 def gpush(ui, repo, remote_name='origin', branch=None):
     git = GitHandler(repo, ui)
     import cProfile, pstats
+    import lsprofcalltree
     prof = cProfile.Profile()
     prof = prof.runctx("git.push(remote_name)", globals(), locals())
     stats = pstats.Stats(prof)
+    k = lsprofcalltree.KCacheGrind(prof)
+    data = open('/tmp/prof.kgrind', 'w+')
+    k.output(data)
+    data.close()
     stats.sort_stats("time")  # Or cumulative
     stats.print_stats(80)  # 80 = how many to print
     # The rest is optional.
-    # stats.print_callees()
-    # stats.print_callers()
+    #stats.print_callees()
+    #stats.print_callers()
 
 def gimport(ui, repo, remote_name=None):
     git = GitHandler(repo, ui)
--- a/git_handler.py	Tue Jun 02 21:27:19 2009 -0700
+++ b/git_handler.py	Wed Jun 03 09:22:17 2009 -0700
@@ -187,6 +187,7 @@
         return dict(filter(is_local_head, refs.items()))
 
     def export_git_objects(self):
+        self.manifest_renames = {}
         self.ui.status(_("importing Hg objects into Git\n"))
         total = len(self.repo.changelog)
         if total:
@@ -331,16 +332,24 @@
         trees = {}
         man = ctx.manifest()
         renames = []
-        for filenm in man.keys():
+        for filenm, nodesha in man.iteritems():
+            file_id = hex(nodesha)
             # write blob if not in our git database
-            fctx = ctx.filectx(filenm)
-            rename = fctx.renamed()
-            if rename:
-                filerename, sha = rename
+            fctx = ctx.filectx(filenm) 
+            filerename = None
+            if file_id in self.manifest_renames:
+                filerename = self.manifest_renames[file_id]
+            else:
+                rename = fctx.renamed()
+                if rename:
+                    filerename, sha = rename
+                    self.manifest_renames[file_id] = filerename
+                else:
+                    self.manifest_renames[file_id] = None                    
+            if filerename:
                 renames.append((filerename, filenm))
             is_exec = 'x' in fctx.flags()
             is_link = 'l' in fctx.flags()
-            file_id = hex(fctx.filenode())
             blob_sha = self.map_git_get(file_id)
             if not blob_sha:
                 blob_sha = self.git.write_blob(fctx.data()) # writing new blobs to git
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lsprofcalltree.py	Wed Jun 03 09:22:17 2009 -0700
@@ -0,0 +1,70 @@
+
+
+def label(code):
+    """Return the function label for a profiler entry's code."""
+    if isinstance(code, str):
+        return '~' + code    # built-in functions ('~' sorts at the end)
+    return '%s %s:%d' % (code.co_name, code.co_filename, code.co_firstlineno)
+
+
+
+class KCacheGrind(object):
+    """Write cProfile/lsprof statistics as callgrind-format text.
+
+    Every time is multiplied by 1000 and truncated to an int before it
+    is written, matching the 'events: Ticks' header line.
+    """
+
+    def __init__(self, profiler):
+        # entries as returned by the profiler's getstats()
+        self.data = profiler.getstats()
+        self.out_file = None
+
+    def output(self, out_file):
+        """Write header, summary and one record per entry to out_file."""
+        self.out_file = out_file
+        out_file.write('events: Ticks\n')
+        self._print_summary()
+        for entry in self.data:
+            self._entry(entry)
+
+    def _print_summary(self):
+        """Write the summary line: the largest totaltime of any entry."""
+        max_cost = 0
+        for entry in self.data:
+            max_cost = max(max_cost, int(entry.totaltime * 1000))
+        self.out_file.write('summary: %d\n' % (max_cost,))
+
+    def _entry(self, entry):
+        """Write one function record followed by its callee records."""
+        write = self.out_file.write
+        code = entry.code
+        inlinetime = int(entry.inlinetime * 1000)
+        # built-in functions are plain strings: no file or line number
+        if isinstance(code, str):
+            filename, lineno = '~', 0
+        else:
+            filename, lineno = code.co_filename, code.co_firstlineno
+        write('fi=%s\n' % (filename,))
+        write('fn=%s\n' % (label(code),))
+        # cost line: source line number, then the function's own time
+        write('%d %d\n' % (lineno, inlinetime))
+        # recursive calls are counted in entry.calls
+        for subentry in entry.calls or []:
+            self._subentry(lineno, subentry)
+        write('\n')
+
+    def _subentry(self, lineno, subentry):
+        """Write the call record for one callee of the current entry."""
+        write = self.out_file.write
+        code = subentry.code
+        totaltime = int(subentry.totaltime * 1000)
+        # built-in callees are plain strings: no file or line number
+        if isinstance(code, str):
+            filename, target = '~', 0
+        else:
+            filename, target = code.co_filename, code.co_firstlineno
+        write('cfn=%s\n' % (label(code),))
+        write('cfi=%s\n' % (filename,))
+        write('calls=%d %d\n' % (subentry.callcount, target))
+        write('%d %d\n' % (lineno, totaltime))