#include "blk256.h"

int blk256_cmp(blk256_t *op1, blk256_t *op2)
{
	if (blk256_sign_bit(op1) != blk256_sign_bit(op2))
		return blk256_sign_bit(op1) ? -1 : 1;

	int neg = blk256_sign(op1);

	if (blk256_high_bit(op1) != blk256_high_bit(op2))
		return blk256_high_bit(op1) ? neg : -neg;

	/* Compare limbs most significant to least */
	int i;
	for (i = 3; i >= 0; i--)
		if (op1->x[i] != op2->x[i])
			return op1->x[i] < op2->x[i] ? -neg : neg;
	return 0;
}

int blk256_cmp_blk128(blk256_t *op1, blk128_t *op2)
{
	if (blk256_sign_bit(op1))
		return -1;

	if (blk256_high_bit(op1) || op1->x[3] || op1->x[2])
		return 1;

	if (op1->x[1] != op2->hi)
		return op1->x[1] < op2->hi ? -1 : 1;

	if (op1->x[0] != op2->lo)
		return op1->x[0] < op2->lo ? -1 : 1;

	return 0;
}

/**
 * NOTE Does not handle negative numbers!
 */
void blk256_inc(blk256_t *op)
{
	/* Takes advantage of short-circuit OR. If increasing x[i] causes all
	 * bits to be set to zero, we must carry and the next arm in the OR
	 * expression will be evaluated.
	 *
	 * Consider "++x0 || ++x1", x1 will only be incremented in the case
	 * where "++x0" evaluates to 0 (ie in the case all bits are 1 prior to
	 * increment).
	 */
	if (++op->x[0] || ++op->x[1] || ++op->x[2])
		return;

	if (!++op->x[3])
		blk256_set_high_bit(op);
}

/**
 * NOTE Does not handle negative numbers!
 */
void blk256_dec(blk256_t *op)
{
	if (op->x[0]-- || op->x[1]-- || op->x[2]--)
		return;

	if (!op->x[3]--)
		blk256_clear_high_bit(op);
}

/**
 *
 */
void blk256_add(blk256_t *rop, blk256_t *op1, blk256_t *op2)
{
	/**
	 * Handle cases where one of the arguments is negative
	 */
	if (blk256_sign_bit(op1) ^ blk256_sign_bit(op2)) {
		if (blk256_sign_bit(op1)) {
			blk256_t x;
			blk256_cp(&x, op1);
			blk256_clear_sign_bit(&x);
			blk256_sub(rop, op2, &x);
		} else {
			blk256_clear_sign_bit(op2);
			blk256_sub(rop, op1, op2);
			blk256_set_sign_bit(op2);
		}
		return;
	}

	/**
	 * Now either (1) both are positive, or (2) both a negative
	 */
	if (rop != op1)
		blk256_cp(rop, op1);

	/* deref */
	uint64_t *opx = rop->x;
	uint64_t *opy = op2->x;

	opx[0] += opy[0];
	if (opx[0] >= opy[0] || ++opx[1] || ++opx[2])
		goto add1;

	if (!++opx[3])
		blk256_set_high_bit(rop);

add1:
	opx[1] += opy[1];
	if (opx[1] >= opy[1] || ++opx[2])
		goto add2;

	if (!++opx[3])
		blk256_set_high_bit(rop);

add2:
	opx[2] += opy[2];
	if (opx[2] >= opy[2])
		goto add3;

	if (!++opx[3])
		blk256_set_high_bit(rop);

add3:
	opx[3] += opy[3];
	if (opx[3] < opy[3])
		blk256_set_high_bit(rop);
	else
		/**
		 * This assignment deals with two things:
		 *
		 * (1) setting the high bit, if present on either op1 or op2
		 * (2) setting the sign bit, if present on both op1 and op2
		 */
		rop->mx |= op2->mx;
}

void blk256_add_blk128(blk256_t *rop, blk256_t *op, blk128_t *inc)
{
	if (rop != op)
		blk256_cp(rop, op);

	rop->x[1] += inc->hi;

	/* Propagate carry */
	if (rop->x[1] >= inc->hi || ++rop->x[2])
		goto add_lo;

	if (!++rop->x[3])
		rop->mx = 1;

add_lo:
	rop->x[0] += inc->lo;

	/* Propagate carry */
	if (rop->x[0] >= inc->lo || ++rop->x[1] || ++rop->x[2])
		return;

	if (!++rop->x[3])
		rop->mx |= 1;
}

void blk256_sub(blk256_t *rop, blk256_t *op1, blk256_t *op2)
{
	blk256_t r;

	/**
	 * Handle negative numbers
	 */
	int neg = blk256_sign_bit(op1) << 1 | blk256_sign_bit(op2);
	switch (neg) {
	case 1:
		/**
		 * op1 >= 0 and op2 < 0
		 *
		 */
		blk256_clear_sign_bit(op2);
		blk256_add(rop, op1, op2);
		blk256_set_sign_bit(op2);
		return;
	case 2:
		/**
		 * op1 < 0 and op2 >= 0
		 */
		blk256_cp(&r, op1);
		blk256_clear_sign_bit(&r);

		blk256_add(rop, &r, op2);
		blk256_set_sign_bit(rop);
		return;
	default:
		/* Nothing */
		;
	}

	/**
	 * This subtraction assumes op1 >= op2, so result is positive!
	 *
	 * At this point either op1, op2 >= 0 or op1, op2 < 0.
	 *
	 * We need to figure if abs(op1) < abs(op2)
	 */
	int abs_cmp = blk256_sign(op1) * blk256_cmp(op1, op2);

	blk256_t *sop;
	if (abs_cmp == -1) {
		/* abs(opx) < abs(opy) */
		blk256_cp(&r, op2);
		sop = op1;
		if (neg)
			blk256_clear_sign_bit(&r);
		else
			blk256_set_sign_bit(&r);
	} else {
		/* abs(opx) >= abs(opy) */
		blk256_cp(&r, op1);
		sop = op2;
		if (neg)
			blk256_set_sign_bit(&r);
		else
			blk256_clear_sign_bit(&r);
	}

	/**
	 * Arranged such that abs(opx) >= abs(opy)
	 */
	uint64_t *opx = r.x;
	uint64_t *opy = sop->x;

	/* Subtract opy[0] */
	if (opx[0] < opy[0] && !opx[1]-- && !opx[2]-- && !opx[3]--)
		blk256_clear_high_bit(&r);
	opx[0] -= opy[0];

	/* Subtract opy[1] */
	if (opx[1] < opy[1] && !opx[2]-- && !opx[3]--)
		blk256_clear_high_bit(&r);
	opx[1] -= opy[1];

	/* Subtract opy[2] */
	if (opx[2] < opy[2] && !opx[3]--)
		blk256_clear_high_bit(&r);
	opx[2] -= opy[2];

	/* Subtract opy[3] */
	if (opx[3] < opy[3])
		blk256_clear_high_bit(&r);
	else
		r.mx |= ((blk256_high_bit(&r) - blk256_high_bit(sop))) & 1;
	opx[3] -= opy[3];

	/* Copy result into rop */
	blk256_cp(rop, &r);
}

// TODO Handle negative numbers
void blk256_sub_blk128(blk256_t *rop, blk256_t *op, blk128_t *dec)
{
	/* Copy op1 into rop */
	if (rop != op)
		blk256_cp(rop, op);

	/* deref */
	uint64_t *opx = rop->x;

	if (opx[1] >= dec->hi || opx[2]--)
		goto sub_hi;

	if (!opx[3]--)
		rop->mx = 0;
sub_hi:
	opx[1] -= dec->hi;

	if (opx[0] >= dec->lo || opx[1]-- || opx[2]--)
		goto sub_lo;

	if (!opx[3]--)
		rop->mx = 0;
sub_lo:
	opx[0] -= dec->lo;
}

// TODO Handle negative numbers
void blk256_sub_u64(blk256_t *rop, blk256_t *op, uint64_t x)
{
	if (rop != op)
		blk256_cp(rop, op);

	/* deref */
	uint64_t *n = rop->x;
	if (n[0] < x)
		if (n[1]-- || n[2]-- || n[3]--)
			rop->mx = 0;
	n[0] -= x;
}

/**
 * TODO Consider: what is the top limit for operands (without overflow)?
 * TODO After above: How is this handled?
 */
void blk256_mul_blk128(blk256_t *rop, blk256_t *op1, blk128_t *op2)
{
	/**
	 * We store our result here until the end
	 */
	uint32_t w[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

	/**
	 * Represent op2 as four 32-bit numbers
	 */
	uint32_t yt[4] = { lo(op2->lo), hi(op2->lo), lo(op2->hi), hi(op2->hi) };

	/**
	 * - c is the carry
	 * - y is the current 32 bits in op2 (from yt)
	 * - t contains 64-bit results from 32-bit multiplications
	 */
	uint32_t c, y;
	uint64_t t;
	int i, j;
	for (i = 0; i < 4; i++) {
		y = yt[i];
		for (j = 0; j < 8; j += 2) {
			t = w[i+j] + lo(op1->x[j>>1]) * y + c;
			w[i+j] = lo(t);
			c = hi(t);

			t = w[i+j+1] + hi(op1->x[j>>1]) * y + c;
			w[i+j+1] = lo(t);
			c = hi(t);
		}
		w[i+8] = c;
		c = 0;
	}

	char mx = (char)w[8] & 1;
	if (blk256_sign_bit(op1))
		mx |= 0x2;

	/**
	 * TODO Consider: This multiplication is "modulo" (2^257), ie the result
	 * must fit in a 257-bit number. Is this a reasonable assumption?
	 */
	blk256_set(rop, mx
			, u64(w[7]) << 32 | u64(w[6])
			, u64(w[5]) << 32 | u64(w[4])
			, u64(w[3]) << 32 | u64(w[2])
			, u64(w[1]) << 32 | u64(w[0]));
}

__inline__ double blk256_as_double(blk256_t *op)
{
	return (double)(blk256_sign(op)) * ((double)blk256_high_bit(op) * B256
			+ (double)op->x[3] * B192
			+ (double)op->x[2] * B128
			+ (double)op->x[1] * B64
			+ (double)op->x[0]);
}

__inline__ double blk128_as_double(blk128_t *op)
{
	return (double)op->hi * B64 + (double)op->lo;
}

__inline__ void double_as_blk256(blk256_t *rop, double x)
{
	int sgn = !!(x < 0);
	if (sgn)
		x = -x;

	if (x < 1) {
		blk256_set(rop, sgn << 1, 0, 0, 0, 1);
		return;
	}
	rop->mx  = (sgn << 1) | (((char)(x / B256)) & 1);
	rop->x[3] = u64(x / B192);
	rop->x[2] = u64(x / B128);
	rop->x[1] = u64(x / B64);
	rop->x[0] = u64(x);
}

/**
 * Division of 256-bit number by 128-bit number. This is useful for
 * decomposition in encoding
 *
 * The method employed is converting numbers to double representation and
 * dividing these. This is a much faster method than any iterative method.
 */
void blk256_div_blk128(blk256_t *q, blk128_t *r, blk256_t *x, blk128_t *d)
{
	blk256_t qq, rr, t, m;

	blk256_set(&qq, 0, 0, 0, 0, 0);
	blk256_cp(&rr, x);

	double div = blk128_as_double(d);

	double z, xd;
	while (blk256_cmp_blk128(&rr, d) > -1
			|| blk256_sign_bit(&rr)) {
		xd = blk256_as_double(&rr);
		z = xd / div;

		double_as_blk256(&t, z);

		/**
		 * Compute quotient (t) times divisor (d)
		 *
		 * If t*d is larger than the remaing value (rr) then our found
		 * quotient is too large, and we iteratively make it smaller.
		 */
		blk256_mul_blk128(&m, &t, d);

		blk256_add(&qq, &qq, &t); /* Add to divisor found so far */
		blk256_sub(&rr, &rr, &m);
	}

	/**
	 * Set quotient
	 */
	blk256_cp(q, &qq);

	/**
	 * Set remainder
	 */
	blk128_set(r, rr.x[1], rr.x[0]);
}