/* Candice Quates, Math 4350 Cryptography final project
*
* This is a very small program that calculates shannon entropy over
* the bytes of a file. It returns values between 0 and 8, so I suppose
* we say it calculates bits of entropy per byte.
*
* It works by reading each byte in the file and accumulating counts of
* each possible byte value (out of 256), which I use to determine
* the frequency of each byte's appearance in the file. This is p(x).
* Then, for each value 0 to 255, I add to the sum p(x)*log(p(x)).
*
* I am using log base 2, because that is what I have in the C library.
* It might be nicer to use log base 256, to normalize the entropy into
* the range 0 to 1, but a value out of 8 still appears to create meaning
* for values, and we are measuring 8 bits at once for each byte, so I
* think that it still works. Besides, everyone I see doing this online
* for malware research uses values out of 8.
*
* The program takes a single argument of a file.
*
* Compile/link it with -lm ie: cc -o shannon shannon.c -lm
*
*/
#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Since the actual program is only about 40-50 lines before comments,
I decided to make this standalone with no functions. */
/*
 * Print the Shannon entropy, in bits per byte, of the file named by the
 * single command-line argument.
 *
 * Output on stdout: the file name, the number of bytes read, and the
 * entropy H = -sum(p(x) * log2(p(x))) over the 256 possible byte values,
 * which lies between 0 (every byte identical, or an empty file) and
 * 8 (all byte values equally frequent).
 *
 * Returns 0 on success. Exits with status 1 after writing a message to
 * stderr if the argument count is wrong, the file cannot be opened, or
 * a read error occurs.
 */
int
main(int argc, char *argv[]) {
    FILE *myfile;
    /* Occurrence count of each possible byte value; starts at zero. */
    unsigned long bytecollection[256] = {0};
    unsigned long count = 0;    /* number of bytes in the file */
    int i;
    int c;                      /* fgetc() result: a byte value or EOF */
    double freq;                /* p(x): relative frequency of one byte value */
    double total = 0.0;         /* entropy accumulation */

    if (argc == 2) {
        printf("filename: %s\n", argv[1]);
    } else {
        fprintf(stderr, "usage: %s filename\n", argv[0]);
        exit(1);
    }

    /* Binary mode, so no platform newline translation alters the bytes
       being counted. */
    myfile = fopen(argv[1], "rb");
    if (myfile == NULL) {
        fprintf(stderr, "error: %s, %s\n", argv[0], strerror(errno));
        exit(1);
    }

    /* Test for EOF before using the value as an index: EOF is negative
       and must never reach the array or be counted as a byte. */
    while ((c = fgetc(myfile)) != EOF) {
        bytecollection[c]++;
        count++;
    }
    /* fgetc() returns EOF for read errors as well as end of file. */
    if (ferror(myfile)) {
        fprintf(stderr, "error: %s, failed reading %s\n", argv[0], argv[1]);
        fclose(myfile);
        exit(1);
    }
    fclose(myfile);

    fprintf(stdout, "bytes in file: %f\n", (double)count);

    for (i = 0; i < 256; i++) {
        /* Absent values contribute nothing to the sum (the limit of
           p*log(p) as p -> 0 is 0). Skipping them also avoids evaluating
           log2(0), and avoids dividing by zero for an empty file. */
        if (bytecollection[i] == 0)
            continue;
        freq = (double)bytecollection[i] / (double)count;
        /* Subtracting applies the leading minus sign of the formula term
           by term, so a zero result prints as 0 rather than -0. */
        total -= freq * log2(freq);
    }

    printf("entropy: %2.10f\n", total);
    return 0;
}
/* All rights reserved. Copyright 2010 Candice Quates. */