/*
 * Internal journal
 */

#include <sys/types.h>		/* [s]size_t */
#include <sys/stat.h>		/* open() */
#include <fcntl.h>		/* open() */
#include <unistd.h>		/* f[data]sync(), close() */
#include <stdlib.h>		/* malloc() and friends */
#include <limits.h>		/* PATH_MAX */
#include <string.h>		/* memcpy() */
#include <stdio.h>		/* fprintf() */
#include <errno.h>		/* errno */
#include <stdint.h>		/* uintX_t */
#include <arpa/inet.h>		/* htonl() and friends */
#include <netinet/in.h>		/* htonl() and friends (on some platforms) */

#include "libjio.h"
#include "common.h"
#include "compat.h"
#include "journal.h"
#include "trans.h"


/*
 * On-disk structures
 *
 * Each transaction will be stored on disk as a single file, composed of a
 * header, operation information, and a trailer. The operation information is
 * composed of repeated operation headers followed by their corresponding
 * data, one for each operation. A special operation header containing all 0s
 * marks the end of the operations.
 * 
 * Visually, something like this:
 * 
 *  +--------+---------+----------+---------+----------+-----+-----+---------+
 *  | header | op1 hdr | op1 data | op2 hdr | op2 data | ... | eoo | trailer |
 *  +--------+---------+----------+---------+----------+-----+-----+---------+
 *             \                                             /
 *              +--------------- operations ----------------+ 
 *
 * The details of each part can be seen on the following structures. All
 * integers are stored in network byte order.
 */

/** Transaction file header.
 *
 * Written once at the very beginning of each transaction file. The struct is
 * packed so its in-memory layout matches the on-disk layout; fields are
 * converted to network byte order before being written. */
struct on_disk_hdr {
	uint16_t ver;		/* format version; this file writes and accepts only 1 */
	uint16_t flags;		/* transaction flags, as passed to journal_new() */
	uint32_t trans_id;	/* id of the transaction stored in this file */
} __attribute__((packed));

/** Transaction file operation header.
 *
 * Precedes the data of each operation. A header with both fields set to 0
 * marks the end of the operation list and carries no data. */
struct on_disk_ophdr {
	uint32_t len;		/* number of data bytes following this header */
	uint64_t offset;	/* offset the operation applies to (presumably in
				 * the journaled file; confirm against the
				 * transaction code) */
} __attribute__((packed));

/** Transaction file trailer.
 *
 * Written last, after the end-of-operations marker; a transaction file
 * without a matching trailer is treated as broken by fill_trans(). */
struct on_disk_trailer {
	uint32_t numops;	/* number of operations stored in the file */
	uint32_t checksum;	/* checksum_buf() of everything before the trailer */
} __attribute__((packed));


/* Convert structs between host and network (on-disk) byte order */

/* Convert a transaction file header, in place, from host to network byte
 * order so it can be written to disk */
static void hdr_hton(struct on_disk_hdr *hdr)
{
	const uint16_t host_ver = hdr->ver;
	const uint16_t host_flags = hdr->flags;
	const uint32_t host_id = hdr->trans_id;

	hdr->ver = htons(host_ver);
	hdr->flags = htons(host_flags);
	hdr->trans_id = htonl(host_id);
}

/* Convert a transaction file header, in place, from network (on-disk) to
 * host byte order */
static void hdr_ntoh(struct on_disk_hdr *hdr)
{
	const uint16_t disk_ver = hdr->ver;
	const uint16_t disk_flags = hdr->flags;
	const uint32_t disk_id = hdr->trans_id;

	hdr->ver = ntohs(disk_ver);
	hdr->flags = ntohs(disk_flags);
	hdr->trans_id = ntohl(disk_id);
}

/* Convert an operation header, in place, from host to network byte order so
 * it can be written to disk */
static void ophdr_hton(struct on_disk_ophdr *ophdr)
{
	const uint32_t host_len = ophdr->len;
	const uint64_t host_offset = ophdr->offset;

	ophdr->len = htonl(host_len);
	ophdr->offset = htonll(host_offset);
}

/* Convert an operation header, in place, from network (on-disk) to host byte
 * order */
static void ophdr_ntoh(struct on_disk_ophdr *ophdr)
{
	const uint32_t disk_len = ophdr->len;
	const uint64_t disk_offset = ophdr->offset;

	ophdr->len = ntohl(disk_len);
	ophdr->offset = ntohll(disk_offset);
}

/* Convert a transaction file trailer, in place, from host to network byte
 * order so it can be written to disk */
static void trailer_hton(struct on_disk_trailer *trailer)
{
	const uint32_t host_numops = trailer->numops;
	const uint32_t host_checksum = trailer->checksum;

	trailer->numops = htonl(host_numops);
	trailer->checksum = htonl(host_checksum);
}

/* Convert a transaction file trailer, in place, from network (on-disk) to
 * host byte order */
static void trailer_ntoh(struct on_disk_trailer *trailer)
{
	const uint32_t disk_numops = trailer->numops;
	const uint32_t disk_checksum = trailer->checksum;

	trailer->numops = ntohl(disk_numops);
	trailer->checksum = ntohl(disk_checksum);
}


/*
 * Helper functions
 */

/** Get a new transaction id.
 *
 * Reads the current maximum id from the journal map while holding a lock on
 * the whole journal lock file, and stores the incremented value back.
 *
 * @returns the new id, or 0 if incrementing the current maximum wrapped
 *	around (in which case the stored maximum is left untouched)
 */
static unsigned int get_tid(struct jfs *fs)
{
	unsigned int last, next;

	/* take the lock over the entire file */
	plockf(fs->jfd, F_LOCKW, 0, 0);

	last = *(fs->jmap);

	fiu_do_on("jio/get_tid/overflow", last = -1);

	/* unsigned arithmetic wraps to 0 on overflow; 0 is never handed out
	 * as a valid id, so only persist non-zero results */
	next = last + 1;
	if (next != 0)
		*(fs->jmap) = next;

	plockf(fs->jfd, F_UNLOCK, 0, 0);
	return next;
}

/** Free a transaction id.
 *
 * Only has an effect when tid is the current maximum stored in the journal
 * map: in that case a lower maximum is looked up and stored. The whole
 * operation runs with the journal lock file locked. The detailed description
 * of the scheme can be found in the "doc/" dir.
 */
static void free_tid(struct jfs *fs, unsigned int tid)
{
	char path[PATH_MAX];
	unsigned int max, candidate;

	/* take the lock over the entire file */
	plockf(fs->jfd, F_LOCKW, 0, 0);

	max = *(fs->jmap);

	if (tid == max) {
		/* walk downwards looking for the new maximum */
		candidate = max - 1;
		while (candidate > 0) {
			get_jtfile(fs, candidate, path);

			/* Stop at the first accessible transaction file, or
			 * at any failure other than EACCES: giving up early
			 * is harmless because the stored max is allowed to be
			 * higher than strictly necessary.
			 * NOTE(review): a missing file makes access() fail
			 * with ENOENT, which also ends the scan here; confirm
			 * that is the intended behaviour. */
			if (access(path, R_OK | W_OK) == 0 || errno != EACCES)
				break;

			candidate--;
		}

		*(fs->jmap) = candidate;
	}

	plockf(fs->jfd, F_UNLOCK, 0, 0);
}


/* Set once the sync() fallback warning has been printed, so it is emitted
 * only one time */
static int already_warned_about_sync = 0;

/** fsync() a directory.
 *
 * @param fd file descriptor of the directory
 * @returns 0 on success (including when the sync() fallback was used), or
 *	the failing fsync() return value otherwise
 */
static int fsync_dir(int fd)
{
	const int rv = fsync(fd);

	if (rv == 0)
		return rv;

	/* any failure other than "not supported here" is reported as is */
	if (errno != EINVAL && errno != EBADF)
		return rv;

	/* fsync() on directories may legitimately be unimplemented, which
	 * shows up as EINVAL or EBADF. Fall back on a global sync(): it is
	 * awful (and might still return before the metadata is on disk), but
	 * it is the saner choice compared to failing outright. */
	sync();

	if (!already_warned_about_sync) {
		fprintf(stderr, "libjio warning: falling back on "
				"sync() for directory syncing\n");
		already_warned_about_sync = 1;
	}

	return 0;
}

/** Corrupt a journal file. Used as a last resource to prevent an applied
 * transaction file laying around */
static int corrupt_journal_file(struct journal_op *jop)
{
	off_t pos;
	struct on_disk_trailer trailer;

	/* We set the number of operations to 0, and the checksum to
	 * 0xffffffff, so there is no chance it's considered valid after a new
	 * transaction overwrites this one */
	trailer.numops = 0;
	trailer.checksum = 0xffffffff;

	pos = lseek(jop->fd, 0, SEEK_END);
	if (pos == (off_t) -1)
		return -1;

	if (pwrite(jop->fd, (void *) &trailer, sizeof(trailer), pos)
			!= sizeof(trailer))
		return -1;

	if (fdatasync(jop->fd) != 0)
		return -1;

	return 0;
}

/** Mark the journal as broken. To do so, we just create a file named "broken"
 * inside the journal directory. Used internally to mark severe journal errors
 * that should prevent further journal use to avoid potential corruption, like
 * failures to remove transaction files. The mark is removed by jfsck().
 *
 * @returns 1 if the mark was created, 0 if creating it failed (errno is left
 *	as set by creat())
 */
static int mark_broken(struct jfs *fs)
{
	char broken_path[PATH_MAX];
	int fd;

	snprintf(broken_path, sizeof(broken_path), "%s/broken", fs->jdir);

	fd = creat(broken_path, 0600);
	if (fd < 0) {
		/* nothing to close; calling close() here would only clobber
		 * errno with EBADF */
		return 0;
	}

	close(fd);
	return 1;
}

/** Check if the journal is broken, i.e. whether a file named "broken" exists
 * inside the journal directory.
 *
 * @returns 1 if the mark is present, 0 otherwise
 */
static int is_broken(struct jfs *fs)
{
	char marker[PATH_MAX];

	snprintf(marker, PATH_MAX, "%s/broken", fs->jdir);

	if (access(marker, F_OK) != 0)
		return 0;

	return 1;
}


/*
 * Journal functions
 */

/** Create a new transaction in the journal.
 *
 * Allocates a transaction id, creates and locks the matching transaction
 * file, and writes the file header to it. Fails right away if the journal is
 * marked as broken.
 *
 * @param fs the journaled file the transaction belongs to
 * @param flags transaction flags, stored in the on-disk header
 * @returns a pointer to an opaque jop_t (that is freed using journal_free),
 *	or NULL if there was an error
 */
struct journal_op *journal_new(struct jfs *fs, unsigned int flags)
{
	struct journal_op *jop = NULL;
	char *path = NULL;
	struct on_disk_hdr hdr;
	struct iovec vec;
	ssize_t written;
	int tfd, tid;

	if (is_broken(fs))
		goto fail;

	jop = malloc(sizeof(struct journal_op));
	if (jop == NULL)
		goto fail;

	path = (char *) malloc(PATH_MAX);
	if (path == NULL)
		goto fail;

	/* 0 is never a valid transaction id, it signals failure */
	tid = get_tid(fs);
	if (tid == 0)
		goto fail;

	/* create the transaction file */
	get_jtfile(fs, tid, path);
	tfd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (tfd < 0)
		goto fail;

	if (plockf(tfd, F_LOCKW, 0, 0) != 0)
		goto fail_unlink;

	jop->fs = fs;
	jop->id = tid;
	jop->fd = tfd;
	jop->name = path;
	jop->numops = 0;
	jop->csum = 0;

	fiu_exit_on("jio/commit/created_tf");

	/* build the header in host order, then convert it for the disk */
	hdr.ver = 1;
	hdr.flags = flags;
	hdr.trans_id = tid;
	hdr_hton(&hdr);

	vec.iov_base = (void *) &hdr;
	vec.iov_len = sizeof(hdr);
	written = swritev(tfd, &vec, 1);
	if (written != sizeof(hdr))
		goto fail_unlink;

	/* the checksum covers the header exactly as it was written */
	jop->csum = checksum_buf(jop->csum, (unsigned char *) &hdr,
			sizeof(hdr));

	fiu_exit_on("jio/commit/tf_header");

	return jop;

fail_unlink:
	/* the transaction file exists: remove it and give the id back */
	unlink(path);
	free_tid(fs, tid);
	close(tfd);

fail:
	free(path);
	free(jop);

	return NULL;
}

/** Save a single operation in the journal file.
 *
 * Appends an operation header followed by the operation data to the
 * transaction file, and folds both into the running checksum.
 *
 * @param jop journal operation obtained from journal_new()
 * @param buf operation data
 * @param len number of bytes in buf; must fit in the 32-bit on-disk length
 * @param offset offset the operation applies to
 * @returns 0 on success, -1 on error (len too large, or the write failed)
 */
int journal_add_op(struct journal_op *jop, unsigned char *buf, size_t len,
		off_t offset)
{
	ssize_t rv;
	struct on_disk_ophdr ophdr;
	struct iovec iov[2];

	/* The on-disk length field is only 32 bits wide. Silently truncating
	 * a larger length would store a header that does not describe the
	 * data following it, producing a transaction file that can't be
	 * parsed back; refuse the operation instead. */
	if (len > UINT32_MAX)
		return -1;

	ophdr.len = (uint32_t) len;
	ophdr.offset = (uint64_t) offset;
	ophdr_hton(&ophdr);

	/* the checksum covers the header exactly as it is written */
	iov[0].iov_base = (void *) &ophdr;
	iov[0].iov_len = sizeof(ophdr);
	jop->csum = checksum_buf(jop->csum, (unsigned char *) &ophdr,
			sizeof(ophdr));

	iov[1].iov_base = (void *) buf;
	iov[1].iov_len = len;
	jop->csum = checksum_buf(jop->csum, buf, len);

	fiu_exit_on("jio/commit/tf_pre_addop");

	rv = swritev(jop->fd, iov, 2);
	if (rv < 0 || (size_t) rv != sizeof(ophdr) + len)
		return -1;

	fiu_exit_on("jio/commit/tf_addop");

	jop->numops++;

	return 0;
}

/** Prepares to commit the operation. Can be omitted.
 *
 * Submits a sync of the transaction file ahead of time, in an attempt to
 * reduce the time journal_commit() spends waiting on fsync(): hopefully at
 * least part of the data is already on its way to disk by then.
 */
void journal_pre_commit(struct journal_op *jop)
{
	const int tfd = jop->fd;

	/* NOTE(review): the (0, 0) range presumably means "the whole file";
	 * confirm against sync_range_submit() */
	sync_range_submit(tfd, 0, 0);
}

/** Commit the journal operation */
int journal_commit(struct journal_op *jop)
{
	ssize_t rv;
	struct on_disk_ophdr ophdr;
	struct on_disk_trailer trailer;
	struct iovec iov[2];

	/* write the empty ophdr to mark there are no more operations, and
	 * then the trailer */
	ophdr.len = 0;
	ophdr.offset = 0;
	ophdr_hton(&ophdr);
	iov[0].iov_base = (void *) &ophdr;
	iov[0].iov_len = sizeof(ophdr);
	jop->csum = checksum_buf(jop->csum, (unsigned char *) &ophdr,
			sizeof(ophdr));

	trailer.checksum = jop->csum;
	trailer.numops = jop->numops;
	trailer_hton(&trailer);
	iov[1].iov_base = (void *) &trailer;
	iov[1].iov_len = sizeof(trailer);

	rv = swritev(jop->fd, iov, 2);
	if (rv != sizeof(ophdr) + sizeof(trailer))
		goto error;

	/* this is a simple but efficient optimization: instead of doing
	 * everything O_SYNC, we sync at this point only, this way we avoid
	 * doing a lot of very small writes; in case of a crash the
	 * transaction file is only useful if it's complete (ie. after this
	 * point) so we only flush here (both data and metadata) */
	if (fsync(jop->fd) != 0)
		goto error;
	if (fsync_dir(jop->fs->jdirfd) != 0)
		goto error;

	fiu_exit_on("jio/commit/tf_sync");

	return 0;

error:
	return -1;
}

/** Free a journal operation.
 *
 * Always closes the transaction file descriptor and releases the memory held
 * by jop. When do_unlink is set, the transaction file is also removed from
 * the journal and its transaction id is given back.
 *
 * NOTE: It can't assume the save completed successfully, so it can be called
 * after a failed save.
 *
 * @param jop journal operation to free; must not be used afterwards
 * @param do_unlink if non-zero, remove the transaction file as well
 * @returns 0 on success, -1 if the transaction file could not be safely
 *	removed (in which case the journal is marked as broken)
 */
int journal_free(struct journal_op *jop, int do_unlink)
{
	int status = 0;

	if (do_unlink) {
		status = -1;

		/* We do not want to leave a possibly complete transaction
		 * file around when the transaction was not committed and the
		 * unlink failed: try to truncate it, and if that fails too,
		 * corrupt it as a last resort. Only when all three attempts
		 * fail is the journal given up on. */
		if (unlink(jop->name) != 0
				&& ftruncate(jop->fd, 0) != 0
				&& corrupt_journal_file(jop) != 0) {
			mark_broken(jop->fs);
		} else if (fsync_dir(jop->fs->jdirfd) != 0) {
			mark_broken(jop->fs);
		} else {
			fiu_exit_on("jio/commit/pre_ok_free_tid");
			free_tid(jop->fs, jop->id);

			status = 0;
		}
	}

	close(jop->fd);

	free(jop->name);
	free(jop);

	return status;
}

/** Fill a transaction structure from a mmapped transaction file. Useful for
 * checking purposes.
 *
 * The parsed operations are appended to ts->op, which is expected to be
 * either NULL or a valid list on entry. Their data buffers point directly
 * into map (nothing is copied), so map must outlive them. The id, flags and
 * counters of ts are overwritten as soon as the header is accepted, even if
 * the rest of the file later turns out to be invalid.
 *
 * On error every operation in ts->op is freed and ts->op is left NULL.
 *
 * @param map start of the transaction file contents
 * @param len number of bytes available at map
 * @param ts transaction structure to fill
 * @returns 0 on success, -1 if the file was broken, -2 if the checksums didn't
 *	match
 */
int fill_trans(unsigned char *map, off_t len, struct jtrans *ts)
{
	int rv;
	unsigned char *p, *end;
	struct operation *op, *tail, *next;
	struct on_disk_hdr hdr;
	struct on_disk_ophdr ophdr;
	struct on_disk_trailer trailer;

	rv = -1;

	/* The smallest valid file has a header, the end-of-operations marker
	 * and a trailer. Compare as signed values so a negative length is
	 * rejected instead of being converted to a huge unsigned one. */
	if (len < (off_t) (sizeof(hdr) + sizeof(ophdr) + sizeof(trailer)))
		return -1;

	p = map;
	end = map + len;

	memcpy(&hdr, p, sizeof(hdr));
	p += sizeof(hdr);

	hdr_ntoh(&hdr);
	if (hdr.ver != 1)
		return -1;

	ts->id = hdr.trans_id;
	ts->flags = hdr.flags;
	ts->numops_r = 0;
	ts->numops_w = 0;
	ts->len_w = 0;

	/* Last element of ts->op, looked up lazily on the first append and
	 * then kept up to date, so appending does not re-walk the list for
	 * every operation. */
	tail = NULL;

	/* Invariant: map <= p <= end. All bounds checks compare sizes against
	 * the bytes remaining (end - p) rather than forming p + size, which
	 * could point past the mapping or wrap around. */
	for (;;) {
		if ((size_t) (end - p) < sizeof(ophdr))
			goto error;

		memcpy(&ophdr, p, sizeof(ophdr));
		p += sizeof(ophdr);

		ophdr_ntoh(&ophdr);

		if (ophdr.len == 0 && ophdr.offset == 0) {
			/* This header marks the end of the operations */
			break;
		}

		if (ophdr.len > (size_t) (end - p))
			goto error;

		op = malloc(sizeof(*op));
		if (op == NULL)
			goto error;

		op->len = ophdr.len;
		op->offset = ophdr.offset;
		op->direction = D_WRITE;

		/* the data is used in place, straight from the mapping */
		op->buf = (void *) p;
		p += ophdr.len;

		op->pdata = NULL;

		op->next = NULL;
		if (ts->op == NULL) {
			op->prev = NULL;
			ts->op = op;
		} else {
			if (tail == NULL) {
				/* list was not empty on entry: find its end
				 * once */
				for (tail = ts->op; tail->next != NULL;
						tail = tail->next)
					;
			}
			op->prev = tail;
			tail->next = op;
		}
		tail = op;

		ts->numops_w++;
		ts->len_w += op->len;
	}

	if ((size_t) (end - p) < sizeof(trailer))
		goto error;

	memcpy(&trailer, p, sizeof(trailer));
	p += sizeof(trailer);

	trailer_ntoh(&trailer);

	if (trailer.numops != ts->numops_w)
		goto error;

	/* the checksum covers everything in the file except the trailer */
	if (checksum_buf(0, map, len - sizeof(trailer)) != trailer.checksum) {
		rv = -2;
		goto error;
	}

	return 0;

error:
	while (ts->op != NULL) {
		next = ts->op->next;
		free(ts->op);
		ts->op = next;
	}
	return rv;
}