#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <aio.h>
#include <errno.h>
#include <sys/times.h>

/* Size in bytes of one chunk; also the size of `buffer`. */
#define CHUNK_SIZE  (1024 * 1024)
/* Number of chunks that main() reads, processes and writes. */
#define CHUNK_COUNT (4 * 64)

/* Each chunk is read as CHUNK_PARTS equally sized parts, one aio request
   per part. */
#define CHUNK_PARTS 8
/* ready[j] is set to 1 by main() once the read of part j is no longer in
   progress; do_some_work() only touches parts flagged here. */
static int ready[CHUNK_PARTS];
/* done[j] is the number of bytes of part j that do_some_work() has already
   consumed, so later calls resume where earlier ones stopped. */
static long done[CHUNK_PARTS];

/* Holds the chunk currently being read, processed and written. */
static char buffer[CHUNK_SIZE];

/* Running total of factorial() over the processed bytes; main() prints it
   in hexadecimal at the end of the run. */
long sum;

/*
 * Despite the name this is not a true factorial: it multiplies together `v`
 * and each of its successive integer halvings (v, v/2, v/4, ...) for as long
 * as the value is still greater than 1.
 *
 * Returns 1 for any `v` that is 1 or less (including negative values).
 */
static int factorial(int v) {
  int product;

  for (product = 1; v > 1; v /= 2) {
    product *= v;
  }
  return product;
}

/*
 * Spend up to `fuel` units of work on the current chunk. One unit is one
 * byte of `buffer` passed through factorial() and added to `sum`.
 *
 * Parts are visited in index order; only parts flagged in `ready` are
 * touched, and each one is resumed at the position recorded in `done`.
 * The function returns when the fuel is used up or every part has been
 * visited, and stores the new position of each part it worked on.
 */
static void do_some_work(int fuel)
{
  const long part_len = CHUNK_SIZE / CHUNK_PARTS;
  int part;

  for (part = 0; (part < CHUNK_PARTS) && (fuel > 0); part++) {
    const char *bytes;
    long cursor;

    if (!ready[part]) {
      continue;
    }

    bytes = buffer + (part * part_len);
    cursor = done[part];
    for (; (fuel > 0) && (cursor < part_len); cursor++, fuel--) {
      sum += factorial(bytes[cursor]);
    }
    done[part] = cursor;
  }
}

/*
 * Print "<what>: <description of err>" on stderr and terminate the process
 * with exit status 1. `what` names the failing call and `err` is the
 * errno-style code that describes the failure.
 */
static void die_with_error(const char *what, int err)
{
  fprintf(stderr, "%s: %s\n", what, strerror(err));
  exit(1);
}

/*
 * Benchmark driver. For each of CHUNK_COUNT chunks it:
 *   1. starts CHUNK_PARTS asynchronous reads from file descriptor 0, one per
 *      part of `buffer`;
 *   2. as parts finish (in index order) marks them ready and calls
 *      do_some_work() on them while later reads are still in flight;
 *   3. writes the chunk to file descriptor 1 asynchronously, calling
 *      do_some_work() again while the write is pending.
 *
 * On completion a summary line is printed to stderr: argv[1] (used as a
 * label), `sum` in hex, system and user CPU ticks, their total, elapsed
 * ticks, and the number of read-poll / write iterations beyond one per chunk.
 *
 * Returns 0 on success; exits with status 1 if any aio call reports an error.
 */
int main(int argc, char **argv)
{
  const int part_size = CHUNK_SIZE / CHUNK_PARTS;
  int i, j, read_iters = 0, write_iters = 0, sent, count, err;
  ssize_t written;
  struct tms t;
  struct aiocb aio, aios[CHUNK_PARTS];
  const struct aiocb * aio_ps[CHUNK_PARTS];
  clock_t start, end;

  start = times(&t);

  for (i = 0; i < CHUNK_COUNT; i++) {
    /* Compute the file offset in off_t so it cannot overflow `int` if the
       chunk constants are ever enlarged. */
    const off_t chunk_base = (off_t)i * CHUNK_SIZE;

    for (j = 0; j < CHUNK_PARTS; j++) {
      ready[j] = 0;
      done[j] = 0;
    }

    /* Queue one read per part of the chunk. */
    for (j = 0; j < CHUNK_PARTS; j++) {
      memset(&aios[j], 0, sizeof(aios[j]));
      aios[j].aio_fildes = 0;
      aios[j].aio_nbytes = part_size;
      aios[j].aio_offset = chunk_base + (j * part_size);
      aios[j].aio_buf = buffer + (j * part_size);
      aios[j].aio_sigevent.sigev_notify = SIGEV_NONE;
      if (aio_read(&aios[j])) {
        die_with_error("aio_read", errno);
      }
      aio_ps[j] = &aios[j];
    }

    /* Consume parts in index order as their reads finish. */
    while (1) {
      read_iters++;
      for (j = 0, count = 0; j < CHUNK_PARTS; j++) {
        if (aio_error(&aios[j]) == EINPROGRESS) {
          break;
        }
        count++;
        if (!ready[j]) {
          ready[j] = 1;
          do_some_work(part_size);
        }
      }
      if (count == CHUNK_PARTS) {
        break;
      }
      /* j is the first part still in progress. Wait on that request only:
         aio_suspend() returns immediately when any listed request has
         already completed, so passing the whole list would busy-spin once
         the first part is done. */
      aio_suspend(&aio_ps[j], 1, NULL);
    }

    for (j = 0; j < CHUNK_PARTS; j++) {
      err = aio_error(&aios[j]);
      if (err) {
        die_with_error("aio_read", err);
      }
      /* aio_return() must be called once per request to retire it. The byte
         count is deliberately ignored, so a short read (e.g. at end of
         input) is not detected here. */
      (void)aio_return(&aios[j]);
    }

    for (sent = 0; sent < CHUNK_SIZE; ) {
      memset(&aio, 0, sizeof(aio));
      aio.aio_fildes = 1;
      /* Only the unsent tail: asking for CHUNK_SIZE bytes starting at
         buffer + sent would read past the end of `buffer` after a short
         write. */
      aio.aio_nbytes = CHUNK_SIZE - sent;
      aio.aio_offset = chunk_base + sent;
      aio.aio_buf = buffer + sent;
      aio.aio_sigevent.sigev_notify = SIGEV_NONE;
      if (aio_write(&aio)) {
        die_with_error("aio_write", errno);
      }

      do_some_work(CHUNK_SIZE);

      aio_ps[0] = &aio;
      /* aio_suspend() may return early (e.g. interrupted by a signal), and
         aio_return() is only valid once the request has completed, so keep
         waiting until the request is no longer in progress. */
      while ((err = aio_error(&aio)) == EINPROGRESS) {
        aio_suspend(aio_ps, 1, NULL);
      }
      if (err) {
        die_with_error("aio_write", err);
      }

      written = aio_return(&aio);
      if (written <= 0) {
        /* Without progress the loop would never terminate. */
        fprintf(stderr, "aio_write: no progress\n");
        exit(1);
      }
      sent += (int)written;
      write_iters++;
    }
  }

  end = times(&t);
  fprintf(stderr,
          "%s: %lx %ld + %ld = %ld / %ld  %d %d\n",
          /* Passing a null pointer to %s is undefined behaviour. */
          (argc > 1) ? argv[1] : "(null)",
          (unsigned long)sum,
          (long)t.tms_stime,
          (long)t.tms_utime,
          (long)(t.tms_stime + t.tms_utime),
          (long)(end - start),
          read_iters - CHUNK_COUNT,
          write_iters - CHUNK_COUNT);

  return 0;
}
