I'm trying to reimplement the SHA-1 algorithm in Python, but the result of my function differs from the one in the builtin hashlib. At first I thought the problem was in my rotate function, but after fixing it there is still a difference in the outputs. Here is my code:
```python
from hashlib import sha1 as builtin_sha1


def rotl32(value: int, count: int) -> int:
    return ((value << count) | (value >> (32 - count))) & 0xffffffff


def default_sha1(data: bytes) -> bytes:
    return builtin_sha1(data).digest()


def sha1(data: bytes) -> bytes:
    # initialize variables
    h0 = 0x67452301
    h1 = 0xefcdab89
    h2 = 0x98badcfe
    h3 = 0x10325476
    h4 = 0xc3d2e1f0

    msg_len = len(data)
    # append 0x80
    data += b"\x80"
    # append 0x00 until msg_len % 64 == 56
    data += b"\x00" * ((56 - msg_len % 64) % 64)
    # append bit length as 64-bit big-endian integer
    data += (msg_len * 8).to_bytes(8, "big")
    # get the new length (now a multiple of 64)
    msg_len = len(data)

    for i in range(0, msg_len, 64):
        # for each chunk of 64 bytes,
        # break the chunk into sixteen 32-bit big-endian words
        words = [int.from_bytes(data[i + j:i + j + 4], "big")
                 for j in range(0, 64, 4)]
        # extend the sixteen 32-bit words into eighty 32-bit words
        for j in range(16, 80):
            words.append(
                rotl32((words[j - 3] ^ words[j - 8] ^ words[j - 14] ^ words[j - 16]), 1)
            )

        # initialize hash value for this chunk
        a = h0
        b = h1
        c = h2
        d = h3
        e = h4

        for j in range(80):
            if 0 <= j <= 19:
                f = (b & c) | ((~b) & d)
                k = 0x5a827999
            elif 20 <= j <= 39:
                f = b ^ c ^ d
                k = 0x6ed9eba1
            elif 40 <= j <= 59:
                f = (b & c) | (b & d) | (c & d)
                k = 0x8f1bbcdc
            else:  # 60 <= j <= 79
                f = b ^ c ^ d
                k = 0xca62c1d6

            temp = (rotl32(a, 5) + f + e + k + words[j]) & 0xffffffff
            e = d
            d = c
            c = rotl32(b, 30)
            b = a
            a = temp

        # add this chunk's hash to result so far
        h0 = (h0 + a) & 0xffffffff
        h1 = (h1 + b) & 0xffffffff
        h2 = (h2 + c) & 0xffffffff
        h3 = (h3 + d) & 0xffffffff
        h4 = (h4 + e) & 0xffffffff

    # produce the final hash value
    return ((h0 << 128) | (h1 << 96) | (h2 << 64) | (h3 << 32) | h4).to_bytes(20, "big")


if __name__ == "__main__":
    assert sha1(b"hello") == default_sha1(b"hello")  # fails: digests differ
```
Maybe it's an endianness problem, but I use big-endian everywhere. I used the pseudocode on the Wikipedia SHA-1 page as a reference.
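Both the rotate and the byte order can be sanity-checked in isolation. A minimal sketch checking `rotl32` against hand-computed values (these inputs and expected outputs are my own, not from the original post):

```python
# rotating 0x80000000 left by 1 wraps the top bit around to bit 0
assert rotl32(0x80000000, 1) == 0x00000001
# rotating left by 5 moves the top 5 bits to the bottom
assert rotl32(0xf0000000, 5) == 0x0000001e
```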
EDIT: I was appending the byte length of the message instead of the bit length. The code above is now corrected and appends the bit length, but the output still differs from the builtin implementation.
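One check that doesn't depend on the compression loop at all is whether the preprocessing produces a block-aligned message. A minimal sketch, using a hypothetical `pad()` helper that just repeats the preprocessing steps from `sha1()` above:

```python
def pad(data: bytes) -> bytes:
    # same preprocessing steps as in sha1() above
    msg_len = len(data)
    data += b"\x80"
    data += b"\x00" * ((56 - msg_len % 64) % 64)
    data += (msg_len * 8).to_bytes(8, "big")
    return data

# SHA-1 consumes the message in 64-byte blocks, so the padded
# length must be an exact multiple of 64
print(len(pad(b"hello")) % 64)  # prints 1, not 0: the padding is off by a byte
```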
Resolved, the issue was in the formula to calculate the padding: instead of `(56 - msg_len % 64) % 64` it should be `(56 - (msg_len + 1) % 64) % 64`. Since I calculated `len(msg)` before appending `0x80`, I needed to add 1 to `msg_len` to get the actual length.
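For completeness, a sketch of the fix in context; only the padding line changes, everything else in `sha1()` above stays the same:

```python
# account for the 0x80 byte that has already been appended:
# the message is msg_len + 1 bytes long at this point
data += b"\x00" * ((56 - (msg_len + 1) % 64) % 64)
```

With that change the digests match the builtin implementation (the extra test inputs here are my own):

```python
assert sha1(b"hello") == default_sha1(b"hello")
assert sha1(b"") == default_sha1(b"")                # empty message
assert sha1(b"a" * 200) == default_sha1(b"a" * 200)  # spans multiple blocks
```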