/*
 * <<< sh_radix.c >>>
 *
 * --- Sample application for isis 'radix sort' - for multiprocessor
 *     Copyright (C) 1995-2000 Amano Lab., Keio University. ---
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, write to the Free Software Foundation, Inc.,
 *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include "osiris.h"
#include "shlib.h"

/* Upper bound applied to the element count given on the command line;
 * radix_sort() also uses it as the element offset between its buffers. */
#define MAXSIZE				0x100000
/* Largest accepted digit width; sizes the per-PU histogram in radix_sort(). */
#define MAXRADIXBIT			16
/* Number of significant bits in every generated key. */
#define VALUEBIT			16
/* Mask of the low VALUEBIT bits.  Built from an unsigned constant because
 * left-shifting a negative signed value (~0) is undefined behaviour. */
#define VALUEMASK			((1U << VALUEBIT) - 1U)

#define DEFAULT_SIZE		16
#define DEFAULT_RADIXBIT	8

typedef unsigned data_t;

/* Linear congruential generator.  rnd() both updates `seed` and yields a
 * value derived from it, so two rnd() calls must never appear in the same
 * unsequenced expression. */
#define rnd() ((unsigned)((seed = 1566083941UL * seed + 1) >> 16))
#define srnd(x) (seed = (x))
static unsigned long seed;

static void radix_sort(data_t *shared_a, int size, int radixbit);
static void init(data_t *a, int size);
static void show(data_t *a, int size);
static int check(data_t *a, int size);

/*
 * main -- parse options, fill the shared array with pseudo-random keys,
 * run the parallel radix sort and (on PU 0 only) optionally verify/print.
 *
 * Arguments:
 *   -r<N> | -r <N>  digit width in bits (values < 1 fall back to
 *                   DEFAULT_RADIXBIT, values > MAXRADIXBIT are clamped)
 *   -t              verify that the result is sorted
 *   -v              verbose output (arrays are printed when size <= 64)
 *   <number>        element count (clamped to MAXSIZE)
 *
 * Returns 1 from PU 0 when -t is given and the check fails, otherwise 0.
 */
int main(int argc, char **argv)
{
	data_t *shared_a;
	int punum = get_punum(), puid = get_puid();
	int size = DEFAULT_SIZE, radixbit = DEFAULT_RADIXBIT,
		verbose_flag = 0, check_flag = 0, result;

	(void)argc;

	/* read arguments */
	while (*++argv != NULL) {
		if (**argv == '-') {
			switch (*++*argv) {
			case 'r':
				/* accept both "-r8" and "-r 8" */
				if (isdigit((unsigned char)*++(*argv))) {
					radixbit = atoi(*argv);
				} else if (argv[1] != NULL) {
					radixbit = atoi(*++argv);
				} else {
					/* "-r" was the last argument: there is no value to
					 * read, so keep the current radixbit instead of
					 * dereferencing the terminating NULL. */
					break;
				}
				radixbit = (radixbit < 1) ? DEFAULT_RADIXBIT :
						   (radixbit > MAXRADIXBIT) ? MAXRADIXBIT : radixbit;
				break;
			case 't':
				check_flag = 1;
				break;
			case 'v':
				verbose_flag = 1;
				break;
			default:
				break;
			}
		} else if (isdigit((unsigned char)**argv)) {
			size = atoi(*argv);
			size = (size < 0) ? DEFAULT_SIZE :
				   ((size > MAXSIZE) ? MAXSIZE : size);
		}
	}
	if (puid == 0 && verbose_flag) {
		/* unsigned shift: left-shifting ~0 (a negative int) is undefined */
		printf("size:%d value:0-%#x radix:%#x\n", size, (unsigned)VALUEMASK,
				(1U << radixbit) - 1U);
	}

	/* initialize */
	/* NOTE(review): every PU executes this call; presumably shared_malloc
	 * hands each of them the same shared region -- confirm in shlib. */
	shared_a = shared_malloc(size * sizeof(*shared_a));
	if (shared_a == NULL) {
		fprintf(stderr, "shared_malloc: out of memory in %s:%d.\n", __FILE__,
			__LINE__);
		exit(1);
	}
	{
		/* Each PU fills its own slice.  The last PU also takes the
		 * size % punum leftover elements, matching the partition that
		 * radix_sort() uses, so no element is sorted uninitialised. */
		const int chunk = size / punum;
		const int count = (puid == punum - 1) ? size - chunk * puid : chunk;

		srnd(puid + 1); rnd();
		init(shared_a + chunk * puid, count);
	}
	if (puid == 0 && verbose_flag) {
		if (size <= 64) {
			puts("source:");
			show(shared_a, size);
		}
	}

	/* calculate */
	radix_sort(shared_a, size, radixbit);
	if (puid == 0) {
		result = ((check_flag) ? check(shared_a, size) : 1);
		if (verbose_flag) {
			if (check_flag) {
				if (result) {
					puts("success.");
				} else {
					puts("failed.");
				}
			}
			if (size <= 64) {
				puts("destination:");
				show(shared_a, size);
			}
		}
		return result ? 0 : 1;
	} else {
		return 0;
	}
}

/*
 * radix_sort -- least-significant-digit radix sort of shared_a[0..size-1],
 * executed cooperatively by all PUs.
 *
 *   shared_a  array to sort; also receives the sorted result
 *   size      number of elements
 *   radixbit  width of one digit in bits (one pass per digit, covering
 *             VALUEBIT bits in total)
 *
 * Every PU must call this function: it contains barrier() calls that all
 * PUs are expected to reach (barrier() comes from shlib; presumably it
 * blocks until every PU has arrived -- confirm).
 */
void radix_sort(data_t *shared_a, const int size, const int radixbit)
{
	const int punum = get_punum(), puid = get_puid();
	/* Work buffer and shared counter table are addressed at fixed element
	 * offsets past shared_a rather than being allocated here.
	 * NOTE(review): the caller in this file only requests `size` elements
	 * from shared_malloc, so this assumes the shared region really extends
	 * that far -- confirm against the shlib allocator. */
	data_t *shared_w = shared_a + MAXSIZE;
	int *shared_c = (int*)(shared_w + MAXSIZE);
	const int radix = (1 << radixbit), radixmask = radix - 1;
	int shiftbit, flag;
	/* One pass per digit.  `flag` toggles each pass and selects which of
	 * the two buffers is the source and which the destination. */
	for (flag = shiftbit = 0;
		 shiftbit < VALUEBIT;
		 shiftbit += radixbit, flag = !flag) {
		/* Histogram / write-offset table used by this PU (static, so it is
		 * not placed on the stack; presumably one copy per PU -- confirm). */
		static int c[1 << MAXRADIXBIT];
		data_t *const src = (!flag) ? shared_a : shared_w;
		data_t *const dst = (!flag) ? shared_w : shared_a;
		/* This PU's slice [start, end); the last PU takes the remainder. */
		const int start = (puid == 0) ? 0 : size / punum * puid,
				  end = (puid == punum - 1) ? size : size / punum * (puid + 1);
		int i;
		for (i = 0; i < radix; i++) c[i] = 0;
		/* Wait so that src (written as dst in the previous pass, or filled
		 * by the caller) is read only after all PUs reach this point. */
		barrier();
		/* Count how many keys of the slice fall into each digit value. */
		for (i = start; i < end; i++) c[(src[i] >> shiftbit) & radixmask]++;
		/* PU 0 seeds the shared table with its counts; its own offset
		 * within each digit bucket is 0. */
		if (puid == 0) {
			for (i = 0; i < radix; i++) shared_c[i] = c[i], c[i] = 0;
		}
		/* PUs 1..punum-1 take turns, one per barrier round: PU i swaps its
		 * count for the running total of PUs 0..i-1 (its offset inside the
		 * bucket) and adds its own count to the shared total. */
		for (i = 1; i < punum; i++) {
			barrier();
			if (puid == i) {
				int j;
				for (j = 0; j < radix; j++) {
					data_t tmp = shared_c[j];
					shared_c[j] += c[j];
					c[j] = tmp;
				}
			}
		}
		barrier();
		/* shared_c now holds the total count per digit.  PU 0 turns it
		 * into an exclusive prefix sum (start index of each bucket in dst)
		 * and at the same time loads those start indices into its own c. */
		if (puid == 0) {
			data_t sum = shared_c[0];
			shared_c[0] = c[0] = 0;
			for (i = 1; i < radix; i++) {
				data_t tmp = c[i] = sum;
				sum += shared_c[i];
				shared_c[i] = tmp;
			}
		}
		barrier();
		/* Other PUs add the bucket start to their in-bucket offset. */
		if (puid > 0) {
			for (i = 0; i < radix; i++) c[i] += shared_c[i];
		}
		/* Scatter: each key goes to the next free slot reserved for this
		 * PU in its digit's bucket. */
		for (i = start; i < end; i++) {
			dst[c[(src[i] >> shiftbit) & radixmask]++] = src[i];
		}
	}
	barrier();
	/* flag set here means the last pass wrote into shared_w; copy the
	 * result back into shared_a, elements interleaved across PUs. */
	if (flag) {
		int i;
		for (i = puid; i < size; i += punum) shared_a[i] = shared_w[i];
		barrier();
	}
}

/*
 * init -- fill a[0..size-1] with pseudo-random keys limited to VALUEBIT bits.
 *
 * Draws two values from the rnd() generator per element, so the result
 * depends on the current `seed`.
 */
void init(data_t *a, int size)
{
	int	i;
	for (i = 0; i < size; i++) {
		/* rnd() modifies `seed`; calling it twice inside one expression
		 * would leave the two updates unsequenced (undefined behaviour),
		 * so draw the high part first, then the low part. */
		const unsigned long hi = rnd();
		const unsigned long lo = rnd();
		const unsigned long rnd_num = (hi << 16) | lo;
		a[i] = ((rnd_num >> (32 - VALUEBIT)) & VALUEMASK);
	}
}

/*
 * show -- print a[0..size-1] as zero-padded hexadecimal numbers,
 * 16 per line, separated by '.', followed by a final newline.
 *
 * For size <= 0 only the final newline is printed (no element is read).
 */
void show(data_t *a, int size)
{
	int i;
	for (i = 0; i < size; i++) {
		/* separator goes before every element but the first; a new
		 * line starts after each group of 16 */
		if (i > 0) {
			putchar((i % 16 != 0) ? '.' : '\n');
		}
		printf("%0*x", ((VALUEBIT + 3) / 4), a[i]);
	}
	putchar('\n');
}

/*
 * check -- report whether a[0..size-1] is in non-decreasing order.
 *
 * Returns 1 when no element is smaller than its predecessor
 * (always the case for size <= 1), otherwise 0.
 */
int check(data_t *a, int size)
{
	int pos = 1;

	while (pos < size) {
		if (a[pos - 1] > a[pos]) {
			return 0;
		}
		pos++;
	}
	return 1;
}
