/*
gcc testperf.c -o ttp `pkg-config --cflags --libs eina ecore` -O3 -march=native

./ttp 1 2 100000000
 */
#include <Eina.h>
#include <Ecore.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define PAYLOAD 64

/* Queue for the "thq" benchmark: main() sends on it, thread_main() waits on it. */
Eina_Thread_Queue *thq = NULL;
/* Pipe for the "pipe" benchmark: thread_main2() reads [0], main() writes [1]. */
int pipefd[2];
/*
 * Message passed through thq by the "thq" benchmark.
 * The Eina queue message member comes first, followed by PAYLOAD bytes of
 * data; thread_main() stops when dat[0] is 0.
 */
struct thqmsg {
   Eina_Thread_Queue_Msg m;
   char dat[PAYLOAD];
};

/*
 * Message written to the pipe by the "pipe" benchmark.
 * main() writes the whole struct in one write(); thread_main2() reads the
 * size field first and then `size` bytes of data, stopping when dat[0] is 0.
 */
struct msg {
   int size;
   char dat[PAYLOAD];
};

/*
 * Consumer thread for the thread-queue ("thq") benchmark.
 *
 * Waits on the global queue `thq` and inspects the first payload byte of
 * every message; a zero byte is the stop marker. Each message is released
 * with eina_thread_queue_wait_done() before the marker is acted upon.
 *
 * `data` and `thread` are unused. Always returns NULL.
 */
static void *
thread_main(void *data, Eina_Thread thread)
{
   struct thqmsg *msg;
   void *handle;
   int stop;

   do
     {
        msg = eina_thread_queue_wait(thq, &handle);
        /* latch the marker first: the message is released right below */
        stop = (msg->dat[0] == 0);
        eina_thread_queue_wait_done(thq, handle);
     }
   while (!stop);

   return NULL;
}

/*
 * Read exactly `len` bytes from `fd` into `buf`, retrying short reads and
 * reads interrupted by a signal.
 *
 * Returns 1 when all bytes were read, 0 on end-of-file or on a read error.
 */
static int
pipe_read_full(int fd, void *buf, size_t len)
{
   char *p = buf;

   while (len > 0)
     {
        ssize_t r = read(fd, p, len);

        if (r < 0)
          {
             if (errno == EINTR) continue;
             return 0;
          }
        if (r == 0) return 0; /* writer closed the pipe */
        p += r;
        len -= (size_t)r;
     }
   return 1;
}

/*
 * Consumer thread for the pipe benchmark.
 *
 * Repeatedly reads one message from pipefd[0]: an int byte count followed by
 * that many payload bytes. A payload whose first byte is 0 is the stop
 * marker. The thread also stops on end-of-file, on a read error, or when the
 * announced size does not fit the local PAYLOAD-sized buffer (the size is
 * used as a read length, so it must be validated before use).
 *
 * `data` and `thread` are unused. Always returns NULL.
 */
static void *
thread_main2(void *data, Eina_Thread thread)
{
   int size;
   char dat[PAYLOAD];

   (void)data;
   (void)thread;

   for (;;)
     {
        if (!pipe_read_full(pipefd[0], &size, sizeof(size)))
          return NULL;
        /* need at least dat[0], and never more than the buffer holds */
        if ((size < 1) || (size > PAYLOAD))
          return NULL;
        if (!pipe_read_full(pipefd[0], dat, (size_t)size))
          return NULL;
        if (dat[0] == 0)
          return NULL;
     }
}

/* Accumulator with ordinary static storage, updated by dothingv1(). */
static int v1 = 0;
/* Thread-local accumulators, updated by dothingv2() and dothingv3(). */
static __thread int v2 = 0;
static __thread int v3 = 0;

/* Start / end timestamps of the section currently being measured. */
static double t0;
static double t;
/* Baseline durations subtracted from later results in main(). */
static double tbase;
static double thdbase;

/* Loop counter used by main(). */
static int i;
/* x, y: values taken from the command line; x is printed with every result. */
static int x;
static int y;
/* Iteration count, taken from the command line and rescaled per section. */
static int l;

/* XOR (i + 1) into v1, the accumulator with plain static storage. */
static void
dothingv1(int i)
{
   v1 = v1 ^ (i + 1);
}

/* XOR (i + 1) into v2, a __thread (thread-local) accumulator. */
static void
dothingv2(int i)
{
   v2 = v2 ^ (i + 1);
}

/* XOR (i + 1) into v3, a __thread (thread-local) accumulator. */
static void
dothingv3(int i)
{
   v3 = v3 ^ (i + 1);
}

/*
 * Thread body for the "__thd2" measurement.
 *
 * Calls dothingv3() `l` times, timing the loop with ecore_time_get(), then
 * adds this thread's v3 to x and prints the elapsed time, x, and the per-call
 * cost in nanoseconds after subtracting thdbase. It writes the file-scope
 * t0, t and x.
 *
 * `data` and `thread` are unused. Always returns NULL.
 */
static void *
thread_main3(void *data, Eina_Thread thread)
{
   int n = 0;
   double elapsed;

   t0 = ecore_time_get();
   while (n < l)
     {
        dothingv3(n);
        asm("");
        n++;
     }
   t = ecore_time_get();

   elapsed = t - t0;
   x += v3;
   printf("__thd2 %2.5f %08x = %1.3f ns / get\n", elapsed, x,
          ((elapsed - thdbase) * 1000000000.0) / (double)l);
   return NULL;
}


/*
 * Benchmark driver.
 *
 * Usage: ttp <x> <y> <iterations>
 *
 *   x, y        integers mixed into the value x that is printed with every
 *               result line
 *   iterations  base loop count `l`; individual sections divide or multiply
 *               it before running
 *
 * Each section times a loop with ecore_time_get() and prints one line with
 * the elapsed seconds, x, and a per-operation / per-message / per-byte cost.
 * Every measured loop body ends in an empty asm statement (presumably to keep
 * the compiler from collapsing the loop).
 *
 * Returns 0 on success and 1 on bad command-line arguments; exits with -1
 * when a thread, queue, pipe or buffer cannot be set up.
 */
int main(int argc, char **argv)
{
   Eina_Spinlock spin;
   Eina_Lock lock;
   Eina_Thread thread;

   if (argc < 4)
     {
        fprintf(stderr, "usage: %s <x> <y> <iterations>\n",
                (argc > 0) ? argv[0] : "ttp");
        return 1;
     }

   x = atoi(argv[1]);
   y = atoi(argv[2]);
   l = atoi(argv[3]);
   /* l == 0 would divide by zero in the first loop, and l < 10 makes l / 10
    * zero so the consumer threads never receive their stop message and the
    * joins below would block forever. */
   if (l < 10)
     {
        fprintf(stderr, "iterations must be at least 10 (got '%s')\n", argv[3]);
        return 1;
     }
   eina_init();
   ecore_init();

   v1 = y;
   v2 = y;
   v3 = y;

   eina_spinlock_new(&spin);
   eina_lock_new(&lock);

   /* scramble x with a run-time value */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        x *= time(NULL) / l;
     }
   t = ecore_time_get();
   printf("============= x l....%i %i %2.5f\n", x, l, t - t0);

   /* plain increment */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        x = x + 1;
        asm("");
     }
   t = ecore_time_get();
   printf("ref++  %2.5f %08x = 0.0 ns\n", t - t0, x);

   /* atomic increment */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        __sync_fetch_and_add(&x, 1);
        asm("");
     }
   t = ecore_time_get();
   printf("ref++a %2.5f %08x = %1.3f ns / op\n", t - t0, x,
          ((t - t0) * 1000000000.0) / (double)l);

   /* conditional increment */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        if (EINA_UNLIKELY(y)) x = x + 1;
        else x = x + 2;
        asm("");
     }
   t = ecore_time_get();
   printf("ref++c %2.5f %08x = 0.0 ns\n", t - t0, x);

   /* plain xor: its duration becomes the baseline tbase */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        x^=y+l;
        asm("");
     }
   t = ecore_time_get();
   printf("none   %2.5f %08x = 0.0 ns\n", t - t0, x);
   tbase = t - t0;

   /* atomic xor */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        __sync_fetch_and_xor(&x, y+l);
        asm("");
     }
   t = ecore_time_get();
   printf("atomic %2.5f %08x = %1.3f ns / op\n", t - t0, x,
          ((t - t0 - tbase) * 1000000000.0) / (double)l);

   /* xor under a spinlock */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        eina_spinlock_take(&spin);
        x^=y+l;
        eina_spinlock_release(&spin);
        asm("");
     }
   t = ecore_time_get();
   printf("spin   %2.5f %08x = %1.3f ns / lock+release\n", t - t0, x,
          ((t - t0 - tbase) * 1000000000.0) / (double)l);

   /* xor under an Eina_Lock */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        eina_lock_take(&lock);
        x^=y+l;
        eina_lock_release(&lock);
        asm("");
     }
   t = ecore_time_get();
   printf("lock   %2.5f %08x = %1.3f ns / lock+release\n", t - t0, x,
          ((t - t0 - tbase) * 1000000000.0) / (double)l);

   /* eina_thread_self() */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        x^=(int)eina_thread_self();
        asm("");
     }
   t = ecore_time_get();
   printf("self   %2.5f %08x = %1.3f ns / self_get\n", t - t0, x,
          ((t - t0 - tbase) * 1000000000.0) / (double)l);

   /*
    * Eina TLS lookups, using five separately created keys.
    *
    * NOTE(review): these sections subtract thdbase, which is only measured
    * by the "thnone" section further down and is therefore still 0.0 here.
    * Confirm whether tbase was meant, or whether "thnone" should run first.
    */
   Eina_TLS tls;
   int v = 0; /* initialised: the loops below read it through the TLS pointer */
   eina_tls_new(&tls);
   eina_tls_set(tls, &v);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        int *vv = eina_tls_get(tls);
        *vv^=y+1;
        asm("");
     }
   t = ecore_time_get();
   x += v;
   printf("tls    %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0 - thdbase) * 1000000000.0) / (double)l);

   Eina_TLS tls2;
   eina_tls_new(&tls2);
   eina_tls_set(tls2, &v);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        int *vv = eina_tls_get(tls2);
        *vv^=y+1;
        asm("");
     }
   t = ecore_time_get();
   x += v;
   printf("tls2   %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0 - thdbase) * 1000000000.0) / (double)l);

   Eina_TLS tls3;
   eina_tls_new(&tls3);
   eina_tls_set(tls3, &v);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        int *vv = eina_tls_get(tls3);
        *vv^=y+1;
        asm("");
     }
   t = ecore_time_get();
   x += v;
   printf("tls3   %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0 - thdbase) * 1000000000.0) / (double)l);

   Eina_TLS tls4;
   eina_tls_new(&tls4);
   eina_tls_set(tls4, &v);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        int *vv = eina_tls_get(tls4);
        *vv^=y+1;
        asm("");
     }
   t = ecore_time_get();
   x += v;
   printf("tls4   %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0 - thdbase) * 1000000000.0) / (double)l);

   /* same again with the key stored in heap memory */
   Eina_TLS *tls5;
   tls5 = malloc(sizeof(Eina_TLS));
   if (!tls5)
     {
        perror("malloc");
        exit(-1);
     }
   eina_tls_new(&(tls5[0]));
   eina_tls_set(tls5[0], &v);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        int *vv = eina_tls_get(tls5[0]);
        *vv^=y+1;
        asm("");
     }
   t = ecore_time_get();
   x += v;
   printf("tls5   %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0 - thdbase) * 1000000000.0) / (double)l);
   eina_tls_free(tls5[0]);
   free(tls5);

   /* function updating a plain static: its duration becomes thdbase */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        dothingv1(i);
        asm("");
     }
   t = ecore_time_get();
   thdbase = t - t0;
   x += v1;
   printf("thnone %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0) * 1000000000.0) / (double)l);

   /* function updating a __thread variable, on the main thread */
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        dothingv2(i);
        asm("");
     }
   t = ecore_time_get();
   x += v2;
   printf("__thrd %2.5f %08x = %1.3f ns / get\n", t - t0, x,
          ((t - t0 - thdbase) * 1000000000.0) / (double)l);

   /* the same measurement taken inside a freshly created thread */
   if (!eina_thread_create(&thread, EINA_THREAD_NORMAL, -1, thread_main3, NULL))
     exit(-1);
   eina_thread_join(thread);

   l /= 10;

   /* Thread queue: the queue must exist before the consumer starts, since
    * thread_main() waits on thq as soon as it runs. */
   thq = eina_thread_queue_new();
   if (!thq)
     exit(-1);
   if (!eina_thread_create(&thread, EINA_THREAD_NORMAL, -1, thread_main, NULL))
     exit(-1);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        void *ref;
        struct thqmsg *dat;

        dat = eina_thread_queue_send(thq, sizeof(struct thqmsg), &ref);
        /* the last message carries the all-zero stop marker */
        if (i == (l - 1)) memset(dat->dat, 0, PAYLOAD);
        else memset(dat->dat, 1, PAYLOAD);
        eina_thread_queue_send_done(thq, ref);
        asm("");
     }
   t = ecore_time_get();
   eina_thread_join(thread);
   printf("thq    %2.5f %08x = %1.3f ns / msg\n", t - t0, x,
          ((t - t0) * 1000000000.0) / (double)l);
   eina_thread_queue_free(thq);
   thq = NULL;

   /* Pipe: same message pattern, sent through a pipe to thread_main2() */
   if (pipe(pipefd) != 0)
     {
        perror("pipe");
        exit(-1);
     }
   if (!eina_thread_create(&thread, EINA_THREAD_NORMAL, -1, thread_main2, NULL))
     exit(-1);
   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        struct msg m;

        m.size = PAYLOAD;
        if (i == (l - 1)) memset(m.dat, 0, PAYLOAD);
        else memset(m.dat, 1, PAYLOAD);
        /* a failed or short write would leave the reader without its stop
         * marker, so give up instead of hanging in the join below */
        if (write(pipefd[1], &m, sizeof(struct msg)) != (ssize_t)sizeof(struct msg))
          {
             perror("write");
             exit(-1);
          }
        asm("");
     }
   t = ecore_time_get();
   eina_thread_join(thread);
   printf("pipe   %2.5f %08x = %1.3f ns / msg\n", t - t0, x,
          ((t - t0) * 1000000000.0) / (double)l);
   close(pipefd[0]);
   close(pipefd[1]);

   l *= 10;

   /* memset / memcpy throughput; l is scaled down as the block size grows.
    * The divisions truncate, so small iteration counts can reach 0 here. */
   char *buf = malloc(32 * 1024 * 1024);
   char *buf2 = malloc(32 * 1024 * 1024);
   if ((!buf) || (!buf2))
     {
        perror("malloc");
        exit(-1);
     }

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 1 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset1  %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 1 * 1024));

   l /= 10;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 8 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset8  %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 8 * 1024));

   l /= 10;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 16 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset16 %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 16 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 32 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset32 %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 32 * 1024));


   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 64 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset64 %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 64 * 1024));

   l /= 100;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 1 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset1m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 1 * 1024 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 2 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset2m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 2 * 1024 * 1024));

   l /= 10;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 4 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset4m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 4 * 1024 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memset(buf, 1, 8 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mset8m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 8 * 1024 * 1024));

   /* undo the four divisions above before the memcpy series */
   l *= 10;
   l *= 10;
   l *= 10;
   l *= 100;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 1 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy1  %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 1 * 1024));

   l /= 10;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 8 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy8  %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 8 * 1024));

   l /= 10;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 16 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy16 %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 16 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 32 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy32 %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 32 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 64 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy64 %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 64 * 1024));

   l /= 100;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 1 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy1m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 1 * 1024 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 2 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy2m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 2 * 1024 * 1024));

   l /= 10;

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 4 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy4m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 4 * 1024 * 1024));

   t0 = ecore_time_get();
   for (i = 0; i < l; i++)
     {
        memcpy(buf2, buf, 8 * 1024 * 1024);
        asm("");
     }
   t = ecore_time_get();
   printf("mcpy8m %2.5f %08x = %1.3f ns / byte\n", t - t0, x,
          ((t - t0) * 1000000000.0) / ((double)l * 8 * 1024 * 1024));

   /* release everything acquired above */
   free(buf2);
   free(buf);
   eina_tls_free(tls4);
   eina_tls_free(tls3);
   eina_tls_free(tls2);
   eina_tls_free(tls);
   eina_lock_free(&lock);
   eina_spinlock_free(&spin);

   ecore_shutdown();
   eina_shutdown();
   return 0;
}
