1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
|
# Copyright (C) 2016 The OpenTimestamps developers
#
# This file is part of python-opentimestamps.
#
# It is subject to the license terms in the LICENSE file found in the top-level
# directory of this distribution.
#
# No part of python-opentimestamps including this file, may be copied,
# modified, propagated, or distributed except according to the terms contained
# in the LICENSE file.
"""Git integration"""
import dbm
import git
import io
import os
from opentimestamps.core.timestamp import Timestamp, DetachedTimestampFile, make_merkle_tree
from opentimestamps.core.op import OpAppend, OpPrepend, OpSHA256
class GitTreeTimestamper:
"""Efficient, privacy-preserving, git tree timestamping
Unfortunately, the way git calculates commit hashes is less than ideal for
timestamping. The first problem is of course the fact that they're SHA1
hashes: still good enough for timestamping, but all the same a dubious
choice of hash algorithm.
The second problem is more subtle: What if I want to extract a timestamp
proof for an individual file in the commit? Since git hashes tree objects
as linear blobs of data, the proof will contain a significant amount of
extraneous metadata about files other than the one you want - inefficient
and a privacy risk.
This class solves these problems by recursively re-hashing a git tree and all blobs
in it with SHA256, using a cache of previously calculated hashes for
efficiency. Each git tree is hashed as a merkle tree, allowing paths to
individual blobs to be extracted efficiently.
For privacy, we guarantee that given a timestamp for a single item in a
given tree, an attacker trying to guess the contents of any other item in
the tree can only do so by brute-forcing all other content in the tree
simultaneously. We achieve this by deterministically generating nonces for
each item in the tree based on the item's hash, and the contents of the
rest of the tree.
However, note that we do _not_ prevent the attacker from learning about the
directly structure of the repository, including the number of items in each
directory.
"""
def __init__(self, tree, db=None, file_hash_op=OpSHA256(), tree_hash_op=OpSHA256()):
self.tree = tree
if db is None:
os.makedirs(tree.repo.git_dir + '/ots', exist_ok=True)
# WARNING: change the version number if any of the following is
# changed; __init__() is consensus-critical!
db = dbm.open(tree.repo.git_dir + '/ots/tree-hash-cache-v3', 'c')
self.db = db
self.file_hash_op = file_hash_op
self.tree_hash_op = tree_hash_op
def do_item(item):
try:
return (item, Timestamp(db[item.hexsha]))
except KeyError:
timestamp = None
if isinstance(item, git.Blob):
timestamp = Timestamp(file_hash_op.hash_fd(item.data_stream[3]))
elif isinstance(item, git.Tree):
stamper = GitTreeTimestamper(item, db=db, file_hash_op=file_hash_op, tree_hash_op=tree_hash_op)
timestamp = stamper.timestamp
elif isinstance(item, git.Submodule):
# A submodule is just a git commit hash.
#
# Unfortunately we're not guaranteed to have the repo
# behind it, so all we can do is timestamp that SHA1 hash.
#
# We do run it through the tree_hash_op to make it
# indistinguishable from other things; consider the
# degenerate case where the only thing in a git repo was a
# submodule.
timestamp = Timestamp(tree_hash_op(item.binsha))
else:
raise NotImplementedError("Don't know what to do with %r" % item)
db[item.hexsha] = timestamp.msg
return (item, timestamp)
self.contents = tuple(do_item(item) for item in self.tree)
if len(self.contents) > 1:
# Deterministically nonce contents in an all-or-nothing transform. As
# mentioned in the class docstring, we want to ensure that the the
# siblings of any leaf in the merkle tree don't give the attacker any
# information about what else is in the tree, unless the attacker
# already knows (or can brute-force) the entire contents of the tree.
#
# While not perfect - a user-provided persistant key would prevent the
# attacker from being able to brute-force the contents - this option
# has the advantage of being possible to calculate deterministically
# using only the tree itself, removing the need to keep secret keys
# that can easily be lost.
#
# First, calculate a nonce_key that depends on the entire contents of
# the tree. The 8-byte tag ensures the key calculated is unique for
# this purpose.
contents_sum = b''.join(stamp.msg for item, stamp in self.contents) + b'\x01\x89\x08\x0c\xfb\xd0\xe8\x08'
nonce_key = tree_hash_op.hash_fd(io.BytesIO(contents_sum))
# Second, calculate per-item nonces deterministically from that key,
# and add those nonces to the timestamps of every item in the tree.
#
# While we usually use 128-bit nonces, here we're using full-length
# nonces. Additionally, we pick append/prepend pseudo-randomly. This
# helps obscure the directory structure, as a commitment for a git tree
# is indistinguishable from a inner node in the per-git-tree merkle
# tree.
def deterministically_nonce_stamp(private_stamp):
nonce1 = tree_hash_op(private_stamp.msg + nonce_key)
nonce2 = tree_hash_op(nonce1)
side = OpPrepend if nonce1[0] & 0b1 else OpAppend
nonce_added = private_stamp.ops.add(side(nonce2))
return nonce_added.ops.add(tree_hash_op)
nonced_contents = (deterministically_nonce_stamp(stamp) for item, stamp in self.contents)
# Note how the current algorithm, if asked to timestamp a tree
# with a single thing in it, will return the hash of that thing
# directly. From the point of view of just commiting to the data that's
# perfectly fine, and probably (slightly) better as it reveals a little
# less information about directory structure.
self.timestamp = make_merkle_tree(nonced_stamp for nonced_stamp in nonced_contents)
elif len(self.contents) == 1:
# If there's only one item in the tree, the fancy all-or-nothing
# transform above is just a waste of ops, so use the tree contents
# directly instead.
self.timestamp = tuple(self.contents)[0][1]
else:
raise AssertionError("Empty git tree")
def __getitem__(self, path):
"""Get a DetachedTimestampFile for blob at path
The timestamp object returned will refer to self.timestamp, so
modifying self.timestamp will modify the timestamp returned.
If path does not exist, FileNotFound error will be raised.
If path exists, but is not a blob, ValueError will be raised.
"""
for item, item_stamp in self.contents:
if item.path == path:
if isinstance(item, git.Blob):
return DetachedTimestampFile(self.file_hash_op, item_stamp)
else:
raise ValueError("Path %r is not a blob" % item.path)
elif path.startswith(item.path + '/'):
if isinstance(item, git.Tree):
# recurse
tree_stamper = GitTreeTimestamper(item, db=self.db, file_hash_op=self.file_hash_op, tree_hash_op=self.tree_hash_op)
tree_stamper.timestamp.merge(item_stamp)
return tree_stamper[path]
else:
raise FileNotFoundError("Path %r not found; prefix %r is a blob" % (path, item.path))
else:
raise FileNotFoundError("Path %r not found" % path)
|