gpt4 book ai didi

python - 在没有 git 的情况下分配目录的 git SHA

转载 作者:太空宇宙 更新时间:2023-11-04 08:03:03 25 4
gpt4 key购买 nike

所以,我发现了这个问题: How to assign a Git SHA1's to a file without Git?

但我不确定如何为目录执行此方法。如何在不使用 git 的情况下对程序中的目录进行哈希处理,使其与 git 给出的 sha1 相匹配?

最佳答案

事实证明这比我预期的要难,但我现在确实已经让它正常工作了。

正如我在评论中所说、hobbs 也在回答中指出的那样,计算树哈希并不简单(nontrivial)。您必须对每个子树中的每个文件进行哈希处理,计算这些子树的哈希值,并使用这些哈希值计算顶级树的哈希值。

附加的 python 代码似乎至少适用于某些测试用例(例如,为 git 源本身计算树哈希)。我在评论中加入了对我一路上发现的一些意想不到的事情的解释。

这段代码现在也放在我的 GitHub "scripts" 仓库中。

[编辑:GitHub 上的版本现在包含了一些针对 Python3 的修复,总体上也更新、更完善。]

#! /usr/bin/env python

"""
Compute git hash values.

This is meant to work with both Python2 and Python3, but
has only been tested with Python2.7.
"""

from __future__ import print_function

import argparse
import os
import stat
import sys

from hashlib import sha1

def strmode(mode):
    """
    Format an internal git mode string for display.

    Internally, modes are kept the way git stores them in tree objects,
    i.e. without leading zeros (a directory is '40000').  For printing
    we want the familiar fixed six-column form, so the string is
    left-padded with '0' characters up to a width of six.  Strings that
    are already six characters or longer are returned unchanged.
    """
    # A negative repeat count yields '', so long inputs pass through.
    padding = '0' * (6 - len(mode))
    return padding + mode

def classify(path):
    """
    Return the git classification of a path.

    The result is a 3-tuple ``(mode, gitclass, size)``:

    * ``mode`` is the git mode string as stored in tree objects
      ('120000' for symlinks, '40000' for directories, and
      '100755' / '100644' for regular files with / without any
      execute permission bit);
    * ``gitclass`` is the git object type, 'blob' or 'tree';
    * ``size`` is the ``st_size`` field from ``lstat``, which callers
      use when hashing file blobs.

    Raises ValueError for anything git cannot store (sockets, FIFOs,
    devices, ...).  OSError from ``os.lstat`` propagates unchanged.
    """
    # We need the X bits of regular files for the mode, so use lstat
    # (which also lets us see symlinks themselves) rather than
    # os.path.isdir().
    st = os.lstat(path)
    if stat.S_ISLNK(st.st_mode):
        gitclass = 'blob'
        mode = '120000'
    elif stat.S_ISDIR(st.st_mode):
        gitclass = 'tree'
        mode = '40000'  # note: no leading 0!
    elif stat.S_ISREG(st.st_mode):
        # 100755 if any execute permission bit is set, else 100644.
        # Spelled with stat constants because the old-style octal
        # literal 0111 is a syntax error on Python 3.
        any_exec = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        gitclass = 'blob'
        mode = '100755' if (st.st_mode & any_exec) != 0 else '100644'
    else:
        raise ValueError('un-git-able file system entity %s' % path)
    return mode, gitclass, st.st_size

def blob_hash(stream, size):
    """
    Return (as hash instance) the git hash of a blob.

    ``stream`` must be a binary stream positioned at the start of the
    blob contents and ``size`` the expected number of bytes.  Git hashes
    the header ``blob <size>\\0`` followed by the raw contents, so the
    size has to be known before any data is read.

    Raises ValueError if the number of bytes actually read differs
    from ``size`` (e.g. the file changed after it was stat'ed).
    Callers use hexdigest() or digest() on the result as needed.
    """
    hasher = sha1()
    hasher.update(b'blob %u\0' % size)
    nread = 0
    while True:
        # We read just 64K at a time to be kind to
        # runtime storage requirements.
        data = stream.read(65536)
        # Test for emptiness rather than comparing with '': a binary
        # stream returns b'' at EOF on Python 3, and b'' != ''.
        if not data:
            break
        nread += len(data)
        hasher.update(data)
    if nread != size:
        # In-memory streams have no .name attribute.
        name = getattr(stream, 'name', '<stream>')
        raise ValueError('%s: expected %u bytes, found %u bytes' %
                         (name, size, nread))
    return hasher

def symlink_hash(path):
    """
    Return (as hash instance) the git hash of a symlink.

    Git stores a symlink as a blob whose contents are the raw bytes
    of the link target, so the hash is that of ``blob <len>\\0<target>``.
    Caller must use hexdigest() or digest() as needed on the result.

    OSError from ``os.readlink`` propagates unchanged.
    """
    hasher = sha1()
    # Git hashes the raw bytes stored in the link.  Passing a bytes
    # path to os.readlink makes it return bytes, which avoids a
    # decode/encode round trip that fails for targets that are not
    # valid UTF-8.  os.fsencode is absent on Python 2, where readlink
    # on a native str already yields raw bytes.
    if hasattr(os, 'fsencode'):
        data = os.readlink(os.fsencode(path))
    else:
        data = os.readlink(path)
    if not isinstance(data, bytes):
        data = data.encode('utf8')
    hasher.update(b'blob %u\0' % len(data))
    hasher.update(data)
    return hasher


def _encode_name(entry):
    """Return a directory entry name as the raw bytes git would store."""
    if isinstance(entry, bytes):
        return entry
    if hasattr(os, 'fsencode'):
        # Reverses the surrogateescape decoding os.listdir applies,
        # so names that are not valid UTF-8 survive intact.
        return os.fsencode(entry)
    return entry.encode('utf8')


def tree_hash(path, args):
    """
    Return (as hash instance) the git hash of the tree rooted at path.

    We need to know all files and sub-trees.  Since order matters, we
    must walk the sub-trees and files in their natural (byte) order,
    so we cannot use os.walk.

    ``args`` is the parsed-arguments namespace; this function reads
    ``args.keep_dot_git`` and ``args.debug`` and temporarily bumps
    ``args.depth`` (used only to indent debug output) while recursing.

    This is slightly defective in that it does not know about
    .gitignore files (we can't just read them since git retains files
    that are in the index, even if they would be ignored by a
    .gitignore directive).  We also do not (cannot) deal with
    submodules here.

    ValueError from classify() and OSError from the file system
    propagate to the caller.
    """
    # Annoyingly, the tree object encodes its size, which requires
    # two passes, one to find the size and one to compute the hash.
    to_skip = ('.', '..') if args.keep_dot_git else ('.', '..', '.git')
    tsize = 0
    pass1 = []
    for entry in os.listdir(path):
        if entry in to_skip:
            continue
        fullpath = os.path.join(path, entry)
        mode, gitclass, esize = classify(fullpath)
        # git stores as mode<sp><entry-name>\0<digest-bytes>
        encoded_form = _encode_name(entry)
        tsize += len(mode) + 1 + len(encoded_form) + 1 + 20
        pass1.append((fullpath, mode, gitclass, esize, encoded_form))

    # Git's cache sorts foo/bar before fooXbar but after foo-bar,
    # because it actually stores foo/bar as the literal string
    # "foo/bar" in the index, rather than using recursion.  That is,
    # a directory name should sort as if it ends with '/' rather than
    # with '\0'.  Sort pass1 contents with funky sorting.
    #
    # (i[4] is the encoded form of the name, i[1] is the mode, which
    # is '40000' for directories.)  The suffix must be bytes: on
    # Python 3, bytes + str raises TypeError.
    pass1.sort(key=lambda i: i[4] + b'/' if i[1] == '40000' else i[4])

    hasher = sha1()
    hasher.update(b'tree %u\0' % tsize)
    args.depth += 1
    try:
        for (fullpath, mode, gitclass, esize, encoded_form) in pass1:
            sub_hash = generic_hash(fullpath, mode, esize, args)
            if args.debug:
                print('%s%s %s %s\t%s' % (
                    '    ' * args.depth, strmode(mode), gitclass,
                    sub_hash.hexdigest(),
                    encoded_form.decode('utf8', 'replace')))

            # Annoyingly, git stores the tree hash as 20 bytes, rather
            # than 40 ASCII characters.  This is why we return the
            # hash instance (so we can use .digest() directly).
            # The format here is <mode><sp><path>\0<raw-hash>.
            # Build it by concatenation: mode is text, and bytes
            # %-formatting with %s rejects str on Python 3.
            hasher.update(mode.encode('ascii') + b' ' + encoded_form + b'\0')
            hasher.update(sub_hash.digest())
    finally:
        # Restore the debug indent even if a nested hash fails.
        args.depth -= 1
    return hasher

def generic_hash(path, mode, size, args):
    """
    Dispatch to the right hasher for an object, based on its git mode.

    Mode '120000' is hashed as a symlink, mode '40000' as a tree
    (recursively, with ``args`` passed through), and anything else
    is opened in binary mode and hashed as a file blob of ``size``
    bytes.  The hash instance is returned in every case.
    """
    if mode == '120000':
        return symlink_hash(path)
    if mode == '40000':
        return tree_hash(path, args)
    # Regular file (100644 or 100755): hash its contents.
    with open(path, 'rb') as blob_stream:
        return blob_hash(blob_stream, size)

def main():
    """
    Parse arguments and invoke hashers.

    For each path on the command line, prints either the git hash
    (``path: <type> hash = <sha1>``) or, with --debug, a line in
    ``<mode> <type> <sha1>\\t<path>`` form (preceded by per-entry
    lines from tree hashing).  Paths that cannot be hashed produce a
    message and make the exit status 1.  Always terminates the
    process through sys.exit().
    """
    # The text is a description; passed positionally it would become
    # ``prog`` and show up as the program name in usage messages.
    parser = argparse.ArgumentParser(description='compute git hashes')
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-k', '--keep-dot-git', action='store_true')
    parser.add_argument('path', nargs='+')
    args = parser.parse_args()
    status = 0
    for path in args.path:
        # Debug indent level; reset per path so a failure while
        # hashing one argument cannot skew the next one's output.
        args.depth = -1
        try:
            try:
                mode, gitclass, size = classify(path)
            except ValueError:
                print('%s: unhashable!' % path)
                status = 1
                continue
            hasher = generic_hash(path, mode, size, args)
            result = hasher.hexdigest()
            if args.debug:
                print('%s %s %s\t%s' % (strmode(mode), gitclass, result,
                                        path))
            else:
                print('%s: %s hash = %s' % (path, gitclass, result))
        except (OSError, ValueError) as err:
            # ValueError here comes from deeper levels: an un-git-able
            # entry inside a tree, or a file whose size changed while
            # it was being read.
            print(str(err))
            status = 1
    sys.exit(status)

# Script entry point: run main() and turn Ctrl-C into a short message
# instead of a traceback.
if __name__ == '__main__':
    try:
        exit_status = main()
    except KeyboardInterrupt:
        sys.exit('\nInterrupted')
    else:
        sys.exit(exit_status)

关于python - 在没有 git 的情况下分配目录的 git SHA,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36657399/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com