# HG changeset patch
# User Pierre-Yves David
# Date 1493619204 -7200
# Node ID d6584ce580309b62715d8c239cdeb3a634b75398
# Parent 017b971ba28f47a12f91677aecae42cba42cb6ef
perf: add a cache to know if obsmarkers might affect a revision

Phase information still needs to be thrown into the mix to compute the final
information, but skipping reading the obsstore for most operations is a large
win. Usage of this cache arrives in the next changeset.

diff -r 017b971ba28f -r d6584ce58030 hgext3rd/evolve/obscache.py
--- a/hgext3rd/evolve/obscache.py	Mon May 01 08:07:05 2017 +0200
+++ b/hgext3rd/evolve/obscache.py	Mon May 01 08:13:24 2017 +0200
@@ -7,6 +7,18 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
+import hashlib
+import struct
+import weakref
+import errno
+
+from mercurial import (
+    localrepo,
+    obsolete,
+    phases,
+    node,
+)
+
 from . import (
     exthelper,
 )
@@ -27,6 +39,7 @@
 @eh.wrapfunction(obsstorefilecache, 'func')
 def obsstorewithcache(orig, repo):
     obsstore = orig(repo)
+    obsstore.obscache = obscache(repo.unfiltered())
 
     class cachekeyobsstore(obsstore.__class__):
 
@@ -138,3 +151,171 @@
         startidx = keyobslength
 
     return True, startrev, startidx
+
+class obscache(object):
+    """cache whether a revision is used as the precursor of some obsmarkers
+
+    This does not directly hold the "is this revision obsolete" information,
+    because phase data comes into play here. However, it allows computing the
+    "obsolescence" set without reading the obsstore content.
+
+    Implementation note #1:
+
+      The obsstore implements only half of the transaction logic it should.
+      It properly records the starting point of the obsstore to allow a clean
+      rollback, but it still writes to the obsstore file directly during the
+      transaction. Instead, it should keep data in memory and write to a
+      '.pending' file to make the data available for hooks.
+
+      This cache does not go further than what the obsstore does, so it has
+      no '.pending' logic either. When the obsstore gains proper '.pending'
+      support, adding it to this cache should not be too hard: as the flag
+      only ever moves from 0 to 1, we could read a second '.pending' cache
+      file and report 1 whenever the flag is set in either of them. For the
+      same reason, updating the file in place should be possible.
+
+    Implementation note #2:
+
+      Instead of one large final update run, we could update this cache at
+      the level of the code adding a new changeset or new obsmarkers. More on
+      this in the 'update' code.
+
+    Implementation note #3:
+
+      Storage-wise, we could record a "start rev" to avoid storing useless
+      zeros. That would be especially useful for the '.pending' overlay.
+ """ + + _filepath = 'cache/evoext-obscache-00' + _headerformat = '>I20sQQ20s' + + def __init__(self, repo): + self._vfs = repo.vfs + # The cache key parts are" + # - tip-rev, + # - tip-node, + # - obsstore-length (nb markers), + # - obsstore-file-size (in bytes), + # - obsstore "cache key" + self._cachekey = None + self._data = bytearray() + + def get(self, rev): + """return True if "rev" is used as "precursors for any obsmarkers + + Make sure the cache has been updated to match the repository content before using it""" + return self._data[rev] + + def clear(self): + """invalidate the cache content""" + self._cachekey = None + self._data = bytearray() + + def update(self, repo): + """Iteratively update the cache with new repository data""" + # If we do not have any data, try loading from disk + if self._cachekey is None: + self.load(repo) + + valid, startrev, startidx = upgradeneeded(repo, self._cachekey) + if not valid: + self.clear() + + if startrev is None and startidx is None: + return + + # process the new changesets + cl = repo.changelog + if startrev is not None: + node = cl.node + # Note: + # + # Newly added changeset might be affected by obsolescence markers + # we already have locally. So we needs to have soem global + # knowledge about the markers to handle that question. Right this + # requires parsing all markers in the obsstore. However, we could + # imagine using various optimisation (eg: bloom filter, other on + # disk cache) to remove this full parsing. + # + # For now we stick to the simpler approach or paying the + # performance cost on new changesets. + succs = repo.obsstore.successors + for r in cl.revs(startrev): + if node(r) in succs: + val = 1 + else: + val = 0 + self._data.append(val) + assert len(self._data) == len(cl), (len(self._data), len(cl)) + + # process the new obsmarkers + if startidx is not None: + rev = cl.nodemap.get + markers = repo.obsstore._all + # Note: + # + # There are no actually needs to load the full obsstore here, + # since we only read the latest ones. We do it for simplicity in + # the first implementation. Loading the full obsstore has a + # performance cost and should go away in this case too. We have + # two simples options for that: + # + # 1) provide and API to start reading markers from a byte offset + # (we have that data in the cache key) + # + # 2) directly update the cache at a lower level, in the code + # responsible for adding a markers. + # + # Option 2 is probably a bit more invasive, but more solid on the long run + + for i in xrange(startidx, len(repo.obsstore)): + r = rev(markers[i][0]) + # If markers affect a newly added nodes, it would have been + # caught in the previous loop, (so we skip < startrev) + if r is not None and (startrev is None or r < startrev): + self._data[r] = 1 + + # XXX note that there are a race condition here, since the repo "might" + # have changed side the cache update above. However, this code will + # mostly be running in a lock so we ignore the issue for now. + # + # To work around this, 'upgradeneeded' should return a bounded amount + # of changeset and markers to read with their associated cachekey. see + # 'upgradeneeded' for detail. 
+        self._cachekey = getcachekey(repo)
+
+    def save(self, repo):
+        """save the data to disk"""
+
+        # XXX it happens that the obsstore is (buggily) always up to date on
+        # disk
+        if self._cachekey is None:
+            return
+
+        with repo.vfs(self._filepath, 'w', atomictemp=True) as cachefile:
+            headerdata = struct.pack(self._headerformat, *self._cachekey)
+            cachefile.write(headerdata)
+            cachefile.write(self._data)
+
+    def load(self, repo):
+        """load data from disk"""
+        assert repo.filtername is None
+
+        data = repo.vfs.tryread(self._filepath)
+        if not data:
+            return
+
+        headersize = struct.calcsize(self._headerformat)
+        self._cachekey = struct.unpack(self._headerformat, data[:headersize])
+        self._data = bytearray(data[headersize:])
+
+@eh.reposetup
+def setupcache(ui, repo):
+
+    class obscacherepo(repo.__class__):
+
+        @localrepo.unfilteredmethod
+        def destroyed(self):
+            if 'obsstore' in vars(self):
+                self.obsstore.obscache.clear()
+            # let the original implementation invalidate the other caches
+            super(obscacherepo, self).destroyed()
+
+    repo.__class__ = obscacherepo
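
For reference, a minimal standalone sketch (not part of the patch; the
concrete values are invented for the example) of how the '>I20sQQ20s' header
format round-trips the five-part cache key documented in obscache.__init__:

import struct

# Header layout used by the cache file (big-endian):
#   I   - tip revision number          (4 bytes)
#   20s - tip node hash                (20 bytes)
#   Q   - number of obsmarkers         (8 bytes)
#   Q   - obsstore file size in bytes  (8 bytes)
#   20s - obsstore "cache key" hash    (20 bytes)
headerformat = '>I20sQQ20s'

# Invented cache key, for illustration only.
cachekey = (42, b'\x01' * 20, 7, 4096, b'\x02' * 20)

packed = struct.pack(headerformat, *cachekey)
assert len(packed) == struct.calcsize(headerformat) == 60
assert struct.unpack(headerformat, packed) == cachekey

Because the header is fixed-width, load() finds the bytearray payload at a
constant offset of struct.calcsize(self._headerformat) bytes.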