/*
 * Utility to take a split data file and merge it into a unified data file.
 *
 * Justin Carlson <justinca+@ri.cmu.edu>
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdarg.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <utils/CannedDataID.h>
#include <utils/port.h>

/* Size in bytes (1 MiB) of the copy buffer that crunch_files() allocates. */
#define BUFSIZE (1024*1024)

/* Printed to stderr by main() when the argument count is not exactly two. */
static char usage_msg[] = "Usage: cdcrunch <input index file> <output file name>\n";

/*
 * Print a printf-style message to stderr and terminate the program.
 *
 * fmt is a printf format string and the remaining arguments are its
 * operands.  It is const-qualified because every caller passes a string
 * literal.  This function never returns: the process exits with status -1.
 */
static void die(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	exit(-1);
}


/*
 * Bookkeeping for one file of the split data set: the index file or one
 * of its data files.
 */
typedef struct {
	int   fd;	/* descriptor obtained from open() in open_files() */
	off_t size;	/* size in bytes, initially st_size from fstat() */
} file_info_t;


/*
 * Open all the files assocated with the set, and grab their sizes.  Store
 * the info is a malloc()ed array at (*info_ptr).  Index 0 is the index
 * file, 1 is the first data file, and so forth. 
 * 
 * Return the number of files opened.
 *
 * Also, warn the user if the total dataset is larger than 2GB, as
 * the operation will probably fail due to filesize limits
 */
static int open_files(char *idx_filename, file_info_t **info_ptr)
{
	int i, tmp;
	char *filename;
	uint64_t total_size;
	int filename_size = strlen(idx_filename) + 20;
	int fd;

	struct stat st;
	filename = (char*) malloc(filename_size);
	if (!filename) {
		die("Out of memory");
	}
	
	
	fd = open(idx_filename, O_RDONLY);
	if (fd < 1) {
		die("Couldn't open %s for input: %s\n",
		    idx_filename, strerror(errno));
	}

	if (fstat(fd, &st)) {
		die("Stat of file %s failed after successful open: %s\n",
		    idx_filename, strerror(errno));
	}
	total_size = st.st_size;
	(*info_ptr) = (file_info_t *)malloc(sizeof(file_info_t));
	if (!(*info_ptr)) {
		die("Out of memory!!");
	}
	(*info_ptr)[0].fd = fd;
	(*info_ptr)[0].size = st.st_size;

	i = 1;
	while(1) {

		tmp = snprintf(filename, filename_size, "%s.data.%i", idx_filename, i-1);
		if (tmp >= filename_size) {
			filename_size++;
			free(filename);
			filename = (char *)malloc(filename_size);
			if (!filename) {
				die("Out of memory");
			}
			continue;
		}
		fd = open(filename, O_RDONLY);
		if (fd < 1) {
			if (!i) {
				die("No data files to be crunched!\n");
			}
			break;
		}
		if (fstat(fd, &st)) {
			die("Stat of file %s failed after successful open: %s\n",
			    filename, strerror(errno));
		}
		(*info_ptr) = (file_info_t *)realloc((*info_ptr), (i + 1) * sizeof(file_info_t));
		(*info_ptr)[i].size = st.st_size;
		(*info_ptr)[i].fd = fd;

		total_size += st.st_size;
		i++;
	}
	free(filename);
	if (total_size > 2000000000) {
		fprintf(stderr, "WARNING: Total data size is over 2 gigs.  "
			"Unless you know what you're doing (e.g. you're on a "
			"nontraditional filesystem like XFS or ReiserFS) this "
			"operation will most likely fail.  Trying anyways...\n");
	}
	return i;
}


/*
 * Refuse to overwrite an existing file.  The only outcome that lets the
 * program continue is stat() failing with ENOENT; if the file exists, or
 * stat() fails for any other reason, complain and die.
 */
static void check_clobber(char *filename)
{
	struct stat info;
	int missing;

	/* stat() runs first; errno is only consulted when it reported failure. */
	missing = (stat(filename, &info) != 0) && (errno == ENOENT);
	if (missing) {
		return;
	}

	die("Action would clobber file %s.  Aborting.\n",
	    filename);
}


/*
 * Helper function; read the 2-byte little-endian size field of an index
 * record and make sure the complete record exists in the index file.
 * Return the size of the record if it is complete, -1 if it is not (the
 * size field could not be read, the stream position could not be
 * determined, or the record would run past idx_file_size).
 *
 * The fd file pointer is left just past the size marker for this
 * record
 */
static int get_idx_size_and_check(FILE *fd, uint32_t idx_file_size)
{
	uint16_t size;
	long pos;

	if (fread(&size, 2, 1, fd) != 1) {
		return -1;
	}
	size = le_to_host_uint16(size);

	pos = ftell(fd);
	if (pos < 0) {
		return -1;
	}
	/*
	 * Compare as 64-bit unsigned values: casting idx_file_size to int
	 * would turn sizes above INT_MAX negative and reject every record.
	 */
	if (((uint64_t)pos + size) > (uint64_t)idx_file_size) {
		return -1;
	}
	return size;
}

/*
 * If we aborted in the middle of dataset creation we could
 * have a couple of problems:
 * 
 * - The last index record could be incomplete
 * - The last data record could not all be there
 * - There could be no terminating END record in the
 *   index
 *
 * This routine takes care of all of those problems
 */
 
static void sanitize_index(FILE *outfd, file_info_t *files, int num_files)
{
	uint16_t id, size;
	int tmp;
	uint32_t rec_size;
	uint32_t rec_ofs;
	uint8_t fileno;
	long last_good_ofs = 12; 

	fseek(outfd, 12, SEEK_SET);
	while (fread(&id, 2, 1, outfd)) {
		id = le_to_host_uint16(id);
		tmp = get_idx_size_and_check(outfd, files[0].size);
		if (tmp < 0) {
			break;
		}
		switch(id) {
		case utils::CDID_IDX:
			/*
			 * Check that the entire data record actually exists in a data file
			 */
			fseek(outfd, 16, SEEK_CUR);
			fread(&rec_ofs, 4, 1, outfd);
			fread(&rec_size, 4, 1, outfd);
			fread(&fileno, 1, 1, outfd);
			rec_ofs = le_to_host_uint32(rec_ofs);
			rec_size = le_to_host_uint32(rec_size);
			if (files[fileno].size < int(rec_ofs + rec_size)) {
				goto out;
			}
			break;
		case utils::CDID_END:
			/* Woohoo.  The index is complete, no sanitizing necessary. */
			return;
		default:
			/* Unknown, and we don't care what it is.  Skip it */
			fseek(outfd, tmp, SEEK_CUR);
			break;
		}
		last_good_ofs = ftell(outfd);
	}
 out:
	/* 
	 * So, when we get here the last_good_ofs points just past the end of the last record
	 * that was complete and referenced a data point that existed in a data file. 
	 * All we have to do now is write an end record out.
	 */
	fseek(outfd, last_good_ofs, SEEK_SET);
	id = host_to_le_uint16(utils::CDID_END);
	fwrite(&id, 2, 1, outfd);
	size = 0;
	fwrite(&size, 2, 1, outfd);

	/*
	 * We've quite possibly changed the size of this file.  If it's shorter, then
	 * the data files to come will overwrite the superfluous junk, if it's longer
	 * than we've extended the file.  Either way, we need to update the
	 * stored information so we get the file offsets right later on
	 */
	files[0].size = ftell(outfd);
}


/*
 * Do the actual concatenation of files.  Sanitize the index file after
 * copying it, but before adding any data files.
 */
static FILE *crunch_files(char *filename, file_info_t *files, int num_files)
{
	int i;
	FILE *outfd, *tmp;
	char *buf;

	check_clobber(filename);
	buf = (char *)malloc(BUFSIZE);
	if (!buf) {
		die("Out of memory\n");
	}


	outfd = fopen(filename, "w+");
        if (!outfd)
          return NULL;
	for (i = 0; i < num_files; i++) {
		tmp = fdopen(files[i].fd, "r");
		if (!tmp) {
			die("fdopen failed: %s\n", strerror(errno));
		}
		while (!feof(tmp)) {
			fwrite(buf, 1, fread(buf, 1, BUFSIZE, tmp), outfd);
		}
		fclose(tmp);
		if (!i) {
			sanitize_index(outfd, files, num_files);
		}
	}
	free(buf);
	return outfd;
}


/*
 * Starting at a record header, walk forward record by record until an
 * IDX or END record header has been read.
 *
 * Returns 1 when an IDX record was found and 0 when END was reached; in
 * both cases the stream is left just past that record's id and size
 * fields.  A failed header read is fatal.
 */
static int find_next_idx(FILE *fd)
{
	uint16_t rec_id = utils::CDID_INVALID;
	uint16_t rec_len = 0;

	for (;;) {
		if (rec_id == utils::CDID_END) {
			return 0;
		}
		if (rec_id == utils::CDID_IDX) {
			return 1;
		}

		/* Step over the payload of the record whose header was read last. */
		fseek(fd, rec_len, SEEK_CUR);

		if (!fread(&rec_id, 2, 1, fd)) {
			die("Unexpected read failure!\n");
		}
		if (!fread(&rec_len, 2, 1, fd)) {
			die("Unexpected read failure!\n");
		}
		rec_id = le_to_host_uint16(rec_id);
		rec_len = le_to_host_uint16(rec_len);
	}
}


/*
 * Scan through the newly created concatenation file and set
 * all IDX filenos to 0 (meaning, the index file), and fixup
 * all record offsets.
 *
 * fd is the merged output stream; files[0..num_files-1] give the size of
 * each input in the order they were concatenated, so the base offset of
 * file i inside the merged file is the sum of the sizes before it.
 *
 * Within an IDX payload the 4-byte offset sits at byte 16 and the 1-byte
 * file number at byte 24.  Any read or write failure, a file number
 * outside files[], or a merged offset that does not fit the 32-bit field
 * is fatal and reported through die().
 */
static void fixup_offsets(FILE *fd, file_info_t *files, int num_files)
{
	uint8_t file_idx = 0;
	uint32_t ofs;
	uint64_t running, merged;
	int i;
	uint64_t *ofs_bases;

	ofs_bases = (uint64_t *)malloc(num_files * sizeof(uint64_t));
	if (!ofs_bases) {
		die("Out of memory\n");
	}

	/* 64-bit bases so that an oversized data set is detected, not wrapped. */
	running = 0;
	for (i = 0; i < num_files; i++) {
		ofs_bases[i] = running;
		running += (uint64_t)files[i].size;
	}

	/* Skip the fixed header information */
	fseek(fd, 12, SEEK_SET);

	/* Fixup each record.  Skip any index records we don't recognize.
	 * rewrite the fileno and offset fields to reflect the merged file
	 */
	while (find_next_idx(fd)) {
		/* Read the file number first (payload byte 24)... */
		fseek(fd, 24, SEEK_CUR);
		if (fread(&file_idx, 1, 1, fd) != 1) {
			die("Unexpected read failure!\n");
		}
		/* ...then step back to the offset field (payload byte 16). */
		fseek(fd, -9, SEEK_CUR);
		if (fread(&ofs, 4, 1, fd) != 1) {
			die("Unexpected read failure!\n");
		}
		ofs = le_to_host_uint32(ofs);

		if (file_idx >= num_files) {
			die("Index references nonexistant data file.  Dataset looks corrupt\n");
		}

		merged = ofs_bases[file_idx] + ofs;
		if (merged > 0xFFFFFFFFu) {
			die("Merged offset does not fit the 32-bit index offset field\n");
		}
		ofs = host_to_le_uint32((uint32_t)merged);
		file_idx = 0;

		/* Each seek also separates a read from the write that follows it. */
		fseek(fd, -4, SEEK_CUR);
		if (fwrite(&ofs, 4, 1, fd) != 1) {
			die("Unexpected write failure!\n");
		}
		fseek(fd, 4, SEEK_CUR);
		if (fwrite(&file_idx, 1, 1, fd) != 1) {
			die("Unexpected write failure!\n");
		}
	}
	free(ofs_bases);
}

/*
 * Entry point: cdcrunch <input index file> <output file name>.
 *
 * Opens the index file and its data files, concatenates them into the
 * output file, then rewrites the index records so they point into the
 * merged file.  Returns 0 on success and -1 on a usage error, when the
 * output file cannot be created, or when closing it fails; most other
 * failures terminate the program inside the helpers.
 */
int main(int argc, char *argv[])
{
	char *idx_filename, *new_filename;
	FILE *outfd;
	file_info_t *files;
	int num_files;

	if (argc != 3) {
		fprintf(stderr, "%s", usage_msg);
		return -1;
	}
	idx_filename = argv[1];
	new_filename = argv[2];

	num_files = open_files(idx_filename, &files);

	outfd = crunch_files(new_filename, files, num_files);
	if (!outfd) {
		fprintf(stderr, "Could not create %s\n", new_filename);
		free(files);
		return -1;
	}

	fixup_offsets(outfd, files, num_files);

	/*
	 * fclose() performs the final flush; a failure here means the merged
	 * file is incomplete and must not be reported as a success.
	 */
	if (fclose(outfd) != 0) {
		fprintf(stderr, "Error writing %s: %s\n",
			new_filename, strerror(errno));
		free(files);
		return -1;
	}
	free(files);
	return 0;
}
