/*  Candice Quates, Math 4350 Cryptography final project
 *   
 *  This is a very small program that calculates Shannon entropy over
 *  the bytes of a file.  It returns values between 0 and 8, so we say
 *  it calculates bits of entropy per byte.
 * 
 *  It works by reading each byte in the file and accumulating counts of
 *  each possible byte value (out of 256), which I use to determine 
 *  the frequency of each byte's appearance in the file.  This is p(x).
 *  Then, for each value 0 to 255, I add to the sum p(x)*log(p(x)).
 *
 *  I am using log base 2, because that is what I have in the C library;
 *  it expresses the entropy in bits per byte (a value out of 8).
 *  It might be nicer to use log base 256, which would normalize the
 *  entropy to a value between 0 and 1, but a value out of 8 still
 *  appears to create meaning for values, and we are measuring 8 bits
 *  at once for each byte, so I think that it still works.  Besides,
 *  everyone I see doing this online for malware research uses values
 *  out of 8.
 * 
 *  The program takes a single argument of a file. 
 *  
 *  Compile/link it with -lm  ie: cc -o shannon shannon.c -lm
 *
 */ 

#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Since the actual program is only about 40-50 lines before comments,
   I decided to make this standalone with no functions. */

/*
 * Compute the Shannon entropy of a file, in bits per byte (0 to 8).
 *
 * argv[1] names the file to analyse.  The program prints the file name,
 * the number of bytes read, and the entropy to stdout.
 *
 * Returns 0 on success.  Exits with status 1 when the argument count is
 * wrong, the file cannot be opened, or reading the file fails.
 */
int
main(int argc, char *argv[]) {
    FILE *myfile;
    /* Number of occurrences of each possible byte value.  A wide unsigned
       type is used so very large files cannot overflow a counter. */
    unsigned long long bytecollection[256] = {0};
    /* Number of bytes in the file.  Kept as an integer so it stays exact;
       it is converted to double only when the frequency is computed. */
    unsigned long long count = 0;
    double total = 0.0;     /* entropy accumulation */
    int i;
    int c;                  /* int, not char, so EOF is distinguishable */

    if (argc == 2) {
        printf("filename:  %s\n", argv[1]);
    } else {
        fprintf(stderr, "usage: %s filename\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "shannon");
        exit(1);
    }

    /* Binary mode: every byte must be counted untranslated, and on some
       platforms text mode rewrites line endings or stops at Ctrl-Z. */
    myfile = fopen(argv[1], "rb");
    if (myfile == NULL) {
        fprintf(stderr, "error: %s, %s\n", argv[0], strerror(errno));
        exit(1);
    }

    /* Read each byte sequentially and bump its slot.  EOF is tested
       BEFORE the value is used as an index: EOF is negative, so indexing
       with it would write outside the array and over-count by one. */
    while ((c = fgetc(myfile)) != EOF) {
        bytecollection[c]++;
        count++;
    }

    /* fgetc returns EOF for both end-of-file and a read error. */
    if (ferror(myfile)) {
        fprintf(stderr, "error: %s, read failed: %s\n", argv[0],
                strerror(errno));
        fclose(myfile);
        exit(1);
    }

    fclose(myfile);
    printf("bytes in file: %llu\n", count);

    /* Shannon entropy: H(X) = -sum(p(x) * log2(p(x))) over all byte
       values.  Values that never occur are skipped: their term is 0 in
       the limit, and log2(0) is not finite.  Skipping them also means an
       empty file (count == 0) never divides by zero and reports 0.
       Subtracting each term (rather than negating the sum at the end)
       avoids printing "-0" for files with zero entropy. */
    for (i = 0; i < 256; i++) {
        if (bytecollection[i] > 0) {
            double p = (double)bytecollection[i] / (double)count;
            total -= p * log2(p);
        }
    }

    printf("entropy: %2.10f\n", total);

    return 0;
}

/* All rights reserved.  Copyright 2010 Candice Quates.  */

