Make binvis play better with restricted symbol sets.

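This means that we can now measure entropy in, for instance, JavaScript files,
where the data does not use all 256 possible byte values. Colour steps and the
entropy base are now scaled by the number of distinct symbols actually present
in the input rather than by a fixed 256.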
This commit is contained in:
Aldo Cortesi 2012-01-09 14:21:51 +13:00
parent 8cf7eb9664
commit 52672e723c
2 changed files with 10 additions and 9 deletions

13
binvis
View File

@ -8,6 +8,9 @@ import Image, ImageDraw
class _Color:
def __init__(self, data, block):
# Keep the raw input and the block argument on the instance.
self.data, self.block = data, block
# Collect the distinct elements that occur in data, in sorted order.
s = list(set(data))
s.sort()
# Map each distinct element to its index in that sorted order, so that
# len(self.symbol_map) is the number of distinct symbols in data.
self.symbol_map = {v : i for (i, v) in enumerate(s)}
def __len__(self):
# The length of this object is the length of the data it wraps.
return len(self.data)
@ -23,10 +26,10 @@ class ColorHilbert(_Color):
def __init__(self, data, block):
_Color.__init__(self, data, block)
self.csource = scurve.fromSize("hilbert", 3, 256**3)
self.step = len(self.csource)/float(256)
self.step = len(self.csource)/float(len(self.symbol_map))
def getPoint(self, x):
c = ord(self.data[x])
c = self.symbol_map[self.data[x]]
return self.csource.point(int(c*self.step))
@ -44,7 +47,8 @@ class ColorClass(_Color):
class ColorEntropy(_Color):
def getPoint(self, x):
e = utils.entropy(self.data, 256, x)
e = utils.entropy(self.data, 256, x, len(self.symbol_map))
# http://www.wolframalpha.com/input/?i=plot+%284%28x-0.5%29-4%28x-0.5%29**2%29**4+from+0.5+to+1
def curve(v):
f = (4*v - 4*v**2)**4
f = max(f, 0)
@ -52,7 +56,6 @@ class ColorEntropy(_Color):
r = curve(e-0.5) if e > 0.5 else 0
b = e**2
return [
# http://www.wolframalpha.com/input/?i=plot+%284%28x-0.5%29-4%28x-0.5%29**2%29**4+from+0.5+to+1
int(255*r),
0,
int(255*b)
@ -159,7 +162,7 @@ def main():
base, _ = base.rsplit(".", 1)
dst = base + options.suffix + ".png"
if os.path.exists(dst):
if os.path.exists(dst) and len(args) < 2:
print >> sys.stderr, "Refusing to over-write '%s'. Specify explicitly if you really want to do this."%dst
sys.exit(1)

View File

@ -100,7 +100,7 @@ def bitrange(x, width, start, end):
return x >> (width-end) & ((2**(end-start))-1)
def entropy(data, blocksize, offset):
def entropy(data, blocksize, offset, symbols=256):
"""
Returns local byte entropy for a location in a file.
"""
@ -115,7 +115,7 @@ def entropy(data, blocksize, offset):
hist = {}
for i in data[start:start+blocksize]:
hist[i] = hist.get(i, 0) + 1
base = min(blocksize, 256)
base = min(blocksize, symbols)
entropy = 0
for i in hist.values():
p = i/float(blocksize)
@ -124,5 +124,3 @@ def entropy(data, blocksize, offset):
# between 0 and 1.
entropy += (p * math.log(p, base))
return -entropy