/*
 * <<< bitonic.c >>>
 *
 * --- Sample application for isis 'bitonic sort' - for uniprocessor
 *     Copyright (C) 2000-2003 Amano Lab., Keio University. ---
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* number of significant bits in every key */
#define VALUEBIT		16
/*
 * Mask selecting the low VALUEBIT bits.  Built from an unsigned constant:
 * left-shifting the negative int ~0 is undefined behaviour.
 */
#define VALUEMASK		(~(~0U << VALUEBIT))
/* element count used when no size is given on the command line */
#define DEFAULT_SIZE	16
/* arrays larger than this are not printed in verbose mode */
#define MAX_PRINT_SIZE	256
/* bits per radix digit; the radix sorts run two passes of this width */
#define RADIXBIT		(VALUEBIT >> 1)
/* length of the static scratch buffers and of the chunks sorted directly */
#define TMP_BUFSIZE		16					/* TMP_BUFSIZE >= 4 */

/* element type of the arrays being sorted */
typedef unsigned data_t;

/*
 * Linear congruential generator: advances `seed` and yields the bits above
 * bit 15 of the new state.  The macro assigns to `seed`, so it must not be
 * expanded more than once inside a single expression.
 */
#define rnd() ((unsigned)((seed = 1566083941UL * seed + 1) >> 16))
static unsigned long seed = 1;

/*
 * Forward declarations.  The helpers are declared static here; their
 * definitions further down omit the keyword but keep internal linkage
 * because of these earlier declarations.  bitonic_sort() is the only
 * function with external linkage.
 */
static void show_help(void);
static int fix_number_to_exp2n(int);	/* round down to a power of two */
static void init(data_t*, int);			/* fill with pseudo-random keys */
static void show(data_t*, int);			/* hex dump, 16 keys per line */
static void show_raw(data_t*, int);		/* hex dump on a single line */
static int check(data_t*, int);			/* 1 if non-decreasing, else 0 */
static void radix_sort(data_t*, int);
static void radix_rev_sort(data_t*, int);
static void bitonic_split(data_t*, data_t*, int);
static void bitonic_rev_split(data_t*, data_t*, int);
static void bitonic_to_sequence(data_t*, int);
static void bitonic_to_rev_sequence(data_t*, int);
void bitonic_sort(data_t*, int);

/*
 * Print the command-line usage summary on stdout.
 *
 * Only the options that main() actually parses are listed.  The former
 * "-r<n>" entry was dropped: main() has no case for it, and the radix width
 * is the compile-time constant RADIXBIT.
 */
void show_help(void)
{
	puts("usage: bitonic [options] [size]");
	puts("options:");
	puts("  -h     print this message.");
	puts("  -i     print timing statistics.");
	puts("  -t     check to make sure all keys are sorted correctly.");
	puts("  -v     verbosely output.");
}

/*
 * Round x down to a power of two.
 *
 * Returns the largest 2^k that does not exceed x, or 0 when x is zero or
 * negative.  The explicit guard matters for negative input: an arithmetic
 * right shift never brings a negative value to 0, so the counting loop
 * below would not terminate.
 */
int fix_number_to_exp2n(int x)
{
	int bits = 0;

	if (x <= 0) {
		return 0;
	}
	/* count the position of the highest set bit */
	while (x != 0) {
		x >>= 1;
		bits++;
	}
	return 1 << (bits - 1);
}

/*
 * Fill a[0..size-1] with pseudo-random keys limited to VALUEBIT bits.
 *
 * Each key consumes two rnd() draws.  rnd() assigns to the global seed, so
 * the draws are made in separate statements: expanding the macro twice
 * inside one expression would modify seed twice without sequencing, which
 * is undefined behaviour.  The first draw supplies the upper part.
 */
void init(data_t *a, int size)
{
	int	i;
	for (i = 0; i < size; i++) {
		unsigned long hi, lo, rnd_num;
		hi = rnd();
		lo = rnd();
		rnd_num = (hi << 16) | lo;
		a[i] = ((rnd_num >> (32 - VALUEBIT)) & VALUEMASK);
	}
}

/*
 * Print a[0..size-1] in zero-padded hexadecimal on stdout.
 *
 * Keys are separated by '.', with a line break after every 16th key, and
 * the output always ends with a newline.  For size <= 0 only that final
 * newline is written; the previous loop had no exit for that case and read
 * past the array.
 */
void show(data_t *a, int size)
{
	const int width = (VALUEBIT + 3) / 4;	/* hex digits per key */
	int i;
	for (i = 0; i < size; i++) {
		if (i > 0) {
			/* separator belongs to the previous key: newline after 16 */
			putchar((i % 16 != 0) ? '.' : '\n');
		}
		printf("%0*x", width, a[i]);
	}
	putchar('\n');
}

/*
 * Print a[0..size-1] in zero-padded hexadecimal on one line, keys separated
 * by '.', with no trailing newline.
 *
 * Nothing is written for size <= 0; the previous loop had no exit for that
 * case and read past the array.
 */
void show_raw(data_t *a, int size)
{
	const int width = (VALUEBIT + 3) / 4;	/* hex digits per key */
	int i;
	for (i = 0; i < size; i++) {
		if (i > 0) {
			putchar('.');
		}
		printf("%0*x", width, a[i]);
	}
}

/*
 * Verify that a[0..size-1] is in non-decreasing order.
 * Returns 1 when it is (including size <= 1), 0 at the first descent found.
 */
int check(data_t *a, int size)
{
	int pos = 1;

	while (pos < size) {
		if (a[pos - 1] > a[pos]) {
			return 0;
		}
		pos++;
	}
	return 1;
}

/*
 * Sort a[0..size-1] into ascending order with a two-pass counting sort,
 * one RADIXBIT-wide digit per pass (low digit first).
 *
 * The intermediate result lives in a static buffer of TMP_BUFSIZE entries,
 * so size must not exceed TMP_BUFSIZE.  Keys index the counter table by
 * (key >> RADIXBIT) in the second pass and therefore must fit in VALUEBIT
 * bits.
 */
void radix_sort(data_t *a, int size)
{
	static data_t scratch[TMP_BUFSIZE];
	static int count[1 << RADIXBIT];
	const int buckets = (1 << RADIXBIT);
	const int low_mask = buckets - 1;
	int pos, d;

	/* pass 1: distribute by the low digit, a -> scratch */
	for (d = 0; d < buckets; d++) {
		count[d] = 0;
	}
	for (pos = 0; pos < size; pos++) {
		count[a[pos] & low_mask] += 1;
	}
	/* running totals: count[d] becomes the end offset of bucket d */
	for (d = 1; d < buckets; d++) {
		count[d] += count[d - 1];
	}
	/* walk backwards so equal digits keep their relative order */
	for (pos = size - 1; pos >= 0; pos--) {
		const data_t key = a[pos];
		scratch[--count[key & low_mask]] = key;
	}

	/* pass 2: distribute by the high digit, scratch -> a */
	for (d = 0; d < buckets; d++) {
		count[d] = 0;
	}
	for (pos = 0; pos < size; pos++) {
		count[scratch[pos] >> RADIXBIT] += 1;
	}
	for (d = 1; d < buckets; d++) {
		count[d] += count[d - 1];
	}
	for (pos = size - 1; pos >= 0; pos--) {
		const data_t key = scratch[pos];
		a[--count[key >> RADIXBIT]] = key;
	}
}

/*
 * Sort a[0..size-1] into descending order with a two-pass counting sort,
 * one RADIXBIT-wide digit per pass (low digit first).
 *
 * Identical to the ascending variant except that the counters are
 * accumulated from the highest digit downwards, which places larger digits
 * first.  size must not exceed TMP_BUFSIZE (static scratch buffer) and keys
 * must fit in VALUEBIT bits.
 */
void radix_rev_sort(data_t *a, int size)
{
	static data_t scratch[TMP_BUFSIZE];
	static int count[1 << RADIXBIT];
	const int buckets = (1 << RADIXBIT);
	const int low_mask = buckets - 1;
	int pos, d;

	/* pass 1: distribute by the low digit, a -> scratch */
	for (d = 0; d < buckets; d++) {
		count[d] = 0;
	}
	for (pos = 0; pos < size; pos++) {
		count[a[pos] & low_mask] += 1;
	}
	/* totals from the top: count[d] = number of keys with digit >= d */
	for (d = buckets - 2; d >= 0; d--) {
		count[d] += count[d + 1];
	}
	for (pos = size - 1; pos >= 0; pos--) {
		const data_t key = a[pos];
		scratch[--count[key & low_mask]] = key;
	}

	/* pass 2: distribute by the high digit, scratch -> a */
	for (d = 0; d < buckets; d++) {
		count[d] = 0;
	}
	for (pos = 0; pos < size; pos++) {
		count[scratch[pos] >> RADIXBIT] += 1;
	}
	for (d = buckets - 2; d >= 0; d--) {
		count[d] += count[d + 1];
	}
	for (pos = size - 1; pos >= 0; pos--) {
		const data_t key = scratch[pos];
		a[--count[key >> RADIXBIT]] = key;
	}
}

/*
 * Program entry point: parse the command line, fill an array with
 * pseudo-random keys, sort it with bitonic_sort() and optionally verify,
 * print and time the run.
 *
 * Arguments starting with '-' are single-letter flags (h, i, t, v; any
 * other letter is ignored).  An argument starting with a digit is the
 * element count; it is rounded down to a power of two and raised to at
 * least TMP_BUFSIZE * 2.  A count that does not fit in an int falls back
 * to DEFAULT_SIZE.
 *
 * Returns 0 on success, 1 when -t was given and the result is not sorted.
 * Exits with status 1 if the array cannot be allocated.
 */
int main(int argc, char **argv)
{
	data_t *a;
	clock_t init_start_time, init_end_time, calc_start_time, calc_end_time;
	int size = DEFAULT_SIZE, verbose_flag = 0, check_flag = 0, info_flag = 0,
		result;

	(void)argc;		/* the argument vector is walked up to its NULL terminator */

	/* read arguments */
	while (*++argv != NULL) {
		if (**argv == '-') {
			switch (*++*argv) {
			case 'h':
				show_help();
				return 0;
			case 'i':
				info_flag = 1;
				break;
			case 't':
				check_flag = 1;
				break;
			case 'v':
				verbose_flag = 1;
				break;
			default:
				break;
			}
		} else if (isdigit((unsigned char)**argv)) {
			/* <ctype.h> functions require an unsigned char value */
			long requested;
			errno = 0;
			requested = strtol(*argv, NULL, 10);
			/* unlike atoi(), strtol() reports overflow instead of invoking
			   undefined behaviour */
			if (errno == ERANGE || requested < 0 || requested > INT_MAX) {
				size = DEFAULT_SIZE;
			} else {
				size = (int)requested;
			}
		}
	}
	size = fix_number_to_exp2n(size);
	if (size < TMP_BUFSIZE * 2) size = TMP_BUFSIZE * 2;
	if (verbose_flag) {
		printf("size:%d value:0-%#x\n", size, VALUEMASK);
	}
	init_start_time = init_end_time = calc_start_time = calc_end_time = 0;

	/* initialize */
	a = malloc((size_t)size * sizeof *a);
	if (a == NULL) {
		fputs("Out of memory.\n", stderr);
		exit(1);
	}
	if (info_flag) init_start_time = clock();
	init(a, size);
	if (info_flag) init_end_time = clock();
	if (verbose_flag) {
		if (size <= MAX_PRINT_SIZE) {
			puts("source:");
			show(a, size);
		}
	}

	/* calculate */
	if (info_flag) calc_start_time = clock();
	bitonic_sort(a, size);
	if (info_flag) calc_end_time = clock();

	/* check (counts as passed when -t was not requested) */
	result = ((check_flag) ? check(a, size) : 1);
	if (verbose_flag) {
		if (check_flag) {
			if (result) {
				puts("success.");
			} else {
				puts("failed.");
			}
		}
		if (size <= MAX_PRINT_SIZE) {
			puts("destination:");
			show(a, size);
		}
	}

	/* show timing statistics (raw clock() ticks) */
	if (info_flag) {
		printf("init start: %10ld\n"
			   "init end:   %10ld\n"
			   "calc start: %10ld\n"
			   "calc end:   %10ld\n"
			   "init time:  %10ld\n"
			   "calc time:  %10ld\n",
			   (long)init_start_time, (long)init_end_time,
			   (long)calc_start_time, (long)calc_end_time,
			   (long)(init_end_time - init_start_time),
			   (long)(calc_end_time - calc_start_time));
	}

	free(a);
	return result ? 0 : 1;
}

/*
 * Compare-exchange step, ascending direction.
 *
 * For every index below n the pair (a[i], b[i]) is ordered so that the
 * smaller key ends up in a and the larger one in b.  With DEBUG defined the
 * two runs are dumped before and after the exchange.
 */
void bitonic_split(data_t *a, data_t *b, int n)
{
	int pos;
#ifdef DEBUG
	printf("bitonic_split(n:%d):\n", n);
	printf("< "); show_raw(a, n); printf(" : "); show_raw(b, n); printf("\n");
#endif /* DEBUG */
	for (pos = 0; pos < n; pos++) {
		const data_t left = a[pos];
		const data_t right = b[pos];
		if (left > right) {
			a[pos] = right;
			b[pos] = left;
		}
	}
#ifdef DEBUG
	printf("> "); show_raw(a, n); printf(" : "); show_raw(b, n); printf("\n");
#endif /* DEBUG */
}

/*
 * Compare-exchange step, descending direction.
 *
 * For every index below n the pair (a[i], b[i]) is ordered so that the
 * larger key ends up in a and the smaller one in b.  With DEBUG defined the
 * two runs are dumped before and after the exchange.
 */
void bitonic_rev_split(data_t *a, data_t *b, int n)
{
	int pos;
#ifdef DEBUG
	printf("bitonic_rev_split(n:%d):\n", n);
	printf("< "); show_raw(a, n); printf(" : "); show_raw(b, n); printf("\n");
#endif /* DEBUG */
	for (pos = 0; pos < n; pos++) {
		const data_t left = a[pos];
		const data_t right = b[pos];
		if (left < right) {
			a[pos] = right;
			b[pos] = left;
		}
	}
#ifdef DEBUG
	printf("> "); show_raw(a, n); printf(" : "); show_raw(b, n); printf("\n");
#endif /* DEBUG */
}

/*
 * Turn the bitonic run a[0..n-1] into an ascending run, in place.
 *
 * The run is rotated into a static buffer so that it starts at its turning
 * point, then merged from both ends of the buffer.  n must not exceed
 * TMP_BUFSIZE (size of that buffer).  Returns without touching a when the
 * run is already ascending, when all keys are equal, or when n < 2; the
 * closing DEBUG dump is skipped on those paths.
 */
void bitonic_to_sequence(data_t *a, int n)
{
	static data_t w[TMP_BUFSIZE];
	int i, j, k, inc_flag = 0;
#ifdef DEBUG
	printf("bitonic_to_sequence(n:%d):\n", n);
	printf("< "); show_raw(a, n); printf("\n");
#endif /* DEBUG */
	/*
	 * Find the first adjacent pair that differs; it gives the initial
	 * direction.  Every pair up to (n-2, n-1) is inspected: stopping one
	 * pair early would leave a run such as {c, c, ..., c, x} with x < c
	 * unsorted.
	 */
	for (i = 0; i < n - 1; i++) {
		if (a[i] < a[i + 1]) {
			inc_flag = 1;
			break;
		} else if (a[i] > a[i + 1]) {
			inc_flag = 0;
			break;
		}
	}
	if (i >= n - 1) return;		/* all keys equal, or fewer than two keys */
	if (inc_flag) {
		/* first direction is increase: advance to the maximum */
		while (a[i] <= a[i + 1]) {
			i++;
			if (i == n - 1) return;		/* ascending throughout */
		}
		/* rotate so that w starts at the maximum: w falls, then rises */
		j = 0; while (i < n) { w[j] = a[i]; i++, j++; }
		i = 0; while (j < n) { w[j] = a[i]; i++, j++; }
		/* the largest remaining key is at one end of w; fill a from the top */
		i = 0, j = n - 1;
		for (k = n - 1; k >= 0; k--) {
			if (w[i] >= w[j]) {
				a[k] = w[i]; i++;
			} else {
				a[k] = w[j]; j--;
			}
		}
	} else {
		/* first direction is decrease: advance to the minimum */
		while (a[i] >= a[i + 1]) {
			i++;
			if (i == n - 1) break;		/* descending throughout */
		}
		/* rotate so that w starts at the minimum: w rises, then falls */
		j = 0; while (i < n) { w[j] = a[i]; i++, j++; }
		i = 0; while (j < n) { w[j] = a[i]; i++, j++; }
		/* the smallest remaining key is at one end of w; fill a from 0 */
		i = 0, j = n - 1;
		for (k = 0; k < n; k++) {
			if (w[i] <= w[j]) {
				a[k] = w[i]; i++;
			} else {
				a[k] = w[j]; j--;
			}
		}
	}
#ifdef DEBUG
	printf("> "); show_raw(a, n); printf("\n");
#endif /* DEBUG */
}

/*
 * Turn the bitonic run a[0..n-1] into a descending run, in place.
 *
 * The run is rotated into a static buffer so that it starts at its turning
 * point, then merged from both ends of the buffer.  n must not exceed
 * TMP_BUFSIZE (size of that buffer).  Returns without touching a when the
 * run is already descending, when all keys are equal, or when n < 2; the
 * closing DEBUG dump is skipped on those paths.
 */
void bitonic_to_rev_sequence(data_t *a, int n)
{
	static data_t w[TMP_BUFSIZE];
	int i, j, k, inc_flag = 0;
#ifdef DEBUG
	printf("bitonic_to_rev_sequence(n:%d):\n", n);
	printf("< "); show_raw(a, n); printf("\n");
#endif /* DEBUG */
	/*
	 * Find the first adjacent pair that differs; it gives the initial
	 * direction.  Every pair up to (n-2, n-1) is inspected: stopping one
	 * pair early would leave a run such as {c, c, ..., c, x} with x > c
	 * unsorted.
	 */
	for (i = 0; i < n - 1; i++) {
		if (a[i] < a[i + 1]) {
			inc_flag = 1;
			break;
		} else if (a[i] > a[i + 1]) {
			inc_flag = 0;
			break;
		}
	}
	if (i >= n - 1) return;		/* all keys equal, or fewer than two keys */
	if (inc_flag) {
		/* first direction is increase: advance to the maximum */
		while (a[i] <= a[i + 1]) {
			i++;
			if (i == n - 1) break;		/* ascending throughout */
		}
		/* rotate so that w starts at the maximum: w falls, then rises */
		j = 0; while (i < n) { w[j] = a[i]; i++, j++; }
		i = 0; while (j < n) { w[j] = a[i]; i++, j++; }
		/* the largest remaining key is at one end of w; fill a from 0 */
		i = 0, j = n - 1;
		for (k = 0; k < n; k++) {
			if (w[i] >= w[j]) {
				a[k] = w[i]; i++;
			} else {
				a[k] = w[j]; j--;
			}
		}
	} else {
		/* first direction is decrease: advance to the minimum */
		while (a[i] >= a[i + 1]) {
			i++;
			if (i == n - 1) return;		/* descending throughout */
		}
		/* rotate so that w starts at the minimum: w rises, then falls */
		j = 0; while (i < n) { w[j] = a[i]; i++, j++; }
		i = 0; while (j < n) { w[j] = a[i]; i++, j++; }
		/* the smallest remaining key is at one end of w; fill a from the top */
		i = 0, j = n - 1;
		for (k = n - 1; k >= 0; k--) {
			if (w[i] <= w[j]) {
				a[k] = w[i]; i++;
			} else {
				a[k] = w[j]; j--;
			}
		}
	}
#ifdef DEBUG
	printf("> "); show_raw(a, n); printf("\n");
#endif /* DEBUG */
}

/*
 * Sort a[0..size-1] into ascending order with a bitonic network whose
 * leaves are chunks of TMP_BUFSIZE keys.
 *
 * The loops step through the array in units of TMP_BUFSIZE * 2 and in
 * doubling block widths, so callers must pass a power of two that is at
 * least TMP_BUFSIZE * 2 (main() adjusts the requested size accordingly).
 * Nothing is validated here.
 */
void bitonic_sort(data_t *a, int size)
{
	int i, j, k, l, i_div_2;

	/* stage 1: sort the chunks directly, alternating ascending and
	   descending, so that every pair of chunks forms a bitonic run */
	for (i = 0; i < size; i += (TMP_BUFSIZE << 1)) {
		radix_sort(a + i, TMP_BUFSIZE);
		radix_rev_sort(a + i + TMP_BUFSIZE, TMP_BUFSIZE);
	}

	/*
	 * stage 2: in every block of width i, make the first half ascending
	 * and the second half descending.  The loop runs on the half-width:
	 * doubling i itself past the final block width overflows a signed int
	 * when size is 2^30.  "i_div_2 <= size / 2" selects the same block
	 * widths as the former "i <= size".
	 */
	for (i_div_2 = (TMP_BUFSIZE << 1); i_div_2 <= size / 2; i_div_2 <<= 1) {
		i = (i_div_2 << 1);
		for (j = 0; j < size; j += i) {
			/* compare-exchange at shrinking distances down to chunk level */
			for (k = i_div_2; k > TMP_BUFSIZE; k >>= 1) {
				const int k_div_2 = (k >> 1);
				for (l = 0; l < i_div_2; l += k) {
					const int k1 = j + l, k2 = i_div_2 + j + l;
					bitonic_split(a + k1, a + k1 + k_div_2, k_div_2);
					bitonic_rev_split(a + k2, a + k2 + k_div_2, k_div_2);
				}
			}
			/* finish each chunk, which is bitonic by now */
			for (k = 0; k < i_div_2; k += TMP_BUFSIZE) {
				bitonic_to_sequence(a + j + k, TMP_BUFSIZE);
				bitonic_to_rev_sequence(a + j + k + i_div_2, TMP_BUFSIZE);
			}
		}
	}

	/* stage 3: final ascending merge of the whole array */
	for (i = size; i > TMP_BUFSIZE; i >>= 1) {
		const int half = (i >> 1);
		for (j = 0; j < size; j += i) {
			bitonic_split(a + j, a + j + half, half);
		}
	}
	for (i = 0; i < size; i += TMP_BUFSIZE) {
		bitonic_to_sequence(a + i, TMP_BUFSIZE);
	}
}
