/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "piglit-util-gl.h"

/**
 * @file time-elapsed.c
 *
 * Test TIME_ELAPSED and TIMESTAMP queries.
 */

PIGLIT_GL_TEST_CONFIG_BEGIN

	config.supports_gl_compat_version = 10;
	config.window_visual = PIGLIT_GL_VISUAL_DOUBLE | PIGLIT_GL_VISUAL_RGBA;

PIGLIT_GL_TEST_CONFIG_END

static char *vs_text =
	"#version 110\n"
	"void main()\n"
	"{\n"
	"  gl_Position = gl_Vertex;\n"
	"}\n";

/**
 * Time-wasting fragment shader.  This fragment shader computes:
 *
 *   x = (sum(i=0..(iters-1): 2*i) % iters) / iters
 *
 * This value should always work out to 0.0, but it's highly unlikely
 * that an optimizer will figure this out.  Hence we can use this
 * shader to waste an arbitrary amount of time (by suitable choice of
 * the value of iters).
 *
 * The shader outputs a color of (x, 1.0, 0.0, 0.0).
 */
static char *fs_text =
	"#version 110\n"
	"uniform int iters;\n"
	"void main()\n"
	"{\n"
	"  int cumulative_sum = 0;\n"
	"  for (int i = 0; i < iters; ++i) {\n"
	"    cumulative_sum += i;\n"
	"    if (cumulative_sum >= iters)\n"
	"      cumulative_sum -= iters;\n"
	"    cumulative_sum += i;\n"
	"    if (cumulative_sum >= iters)\n"
	"      cumulative_sum -= iters;\n"
	"  }\n"
	"  float x = float(cumulative_sum) / float(iters);\n"
	"  gl_FragColor = vec4(x, 1.0, 0.0, 0.0);\n"
	"}\n";

enum {
	TIME_ELAPSED,
	TIMESTAMP
} test = TIME_ELAPSED;

GLuint prog;
GLint iters_loc;

static float
draw(GLuint *q, int iters)
{
	int64_t start_time, end_time;

	glUseProgram(prog);
	glUniform1i(iters_loc, iters);

	start_time = piglit_time_get_nano();

	if (test == TIMESTAMP) {
		glQueryCounter(q[0], GL_TIMESTAMP);
	} else {
		glBeginQuery(GL_TIME_ELAPSED, q[0]);
	}
	piglit_draw_rect(-1, -1, 2, 2);
	if (test == TIMESTAMP) {
		glQueryCounter(q[1], GL_TIMESTAMP);
	} else {
		glEndQuery(GL_TIME_ELAPSED);
	}

	/* This glFinish() is important, since this is used in a
	 * timing loop.
	 */
	glFinish();

	end_time = piglit_time_get_nano();

	return (end_time - start_time)/ 1000.0 / 1000.0 / 1000.0;
}

static float
get_gpu_time(GLuint *q)
{
	GLint64EXT elapsed;

	if (test == TIMESTAMP) {
		GLint64 start, end;
		glGetQueryObjecti64vEXT(q[0], GL_QUERY_RESULT, &start);
		glGetQueryObjecti64vEXT(q[1], GL_QUERY_RESULT, &end);
		elapsed = end - start;
	} else {
		glGetQueryObjecti64vEXT(q[0], GL_QUERY_RESULT, &elapsed);
	}

	return elapsed / 1000.0 / 1000.0 / 1000.0;
}

enum piglit_result
piglit_display(void)
{
	bool pass = true;
	float green[4] = {0.0, 1.0, 0.0, 0.0};
	GLuint q[2];
	int iters;
#define	NUM_RESULTS 5
	float cpu_time[NUM_RESULTS];
	float gpu_time[NUM_RESULTS];
	float delta[NUM_RESULTS];
	float cpu_time_mean;
	float delta_mean, delta_stddev;
	float cpu_overhead;
	float t, t_cutoff;
	int i;

	glColor4f(0.0, 1.0, 0.0, 0.0);
	glGenQueries(2, q);

	/* Prime the drawing pipe before we start measuring time,
	 * since the first draw call is likely to be slower than all
	 * others.
	 */
	draw(q, 1);

	/* Figure out some baseline difference between GPU time
	 * elapsed and CPU time elapsed for a single draw call (CPU
	 * overhead of timer query and glFinish()).
	 *
	 * Note that this doesn't take into account any extra CPU time
	 * elapsed from start to finish if multiple batchbuffers are
	 * accumulated by the driver in getting to our 1/10th of a
	 * second elapsed time goal, and some other client sneaks
	 * rendering in in between those batches.
	 *
	 * Part of the rendering size being relatively large is to
	 * hopefully avoid that, though it might be better to have
	 * some time-consuming shader with a single draw call instead.
	 */
	cpu_overhead = 0;
	for (i = 0; i < NUM_RESULTS; i++) {
		cpu_time[i] = draw(q, 1);
		gpu_time[i] = get_gpu_time(q);

		cpu_overhead += cpu_time[i] - gpu_time[i];
	}
	cpu_overhead /= NUM_RESULTS;

	/* Find a number of draw calls that takes about 1/10th of a
	 * second.
	 */
retry:
	for (iters = 1; ; iters *= 2) {
		if (draw(q, iters) > 0.1)
			break;
		if (iters * 2 <= iters) {
			printf("Couldn't find appropriate number of iterations\n");
			piglit_report_result(PIGLIT_FAIL);
		}
	}

	/* Now, do several runs like this so we can determine if the
	 * timer matches up with wall time.
	 */
	for (i = 0; i < NUM_RESULTS; i++) {
		cpu_time[i] = draw(q, iters);
		gpu_time[i] = get_gpu_time(q);
	}

	cpu_time_mean = 0;
	delta_mean = 0;
	for (i = 0; i < NUM_RESULTS; i++) {
		delta[i] = cpu_time[i] - cpu_overhead - gpu_time[i];
		cpu_time_mean += cpu_time[i];
		delta_mean += delta[i];
	}
	cpu_time_mean /= NUM_RESULTS;
	delta_mean /= NUM_RESULTS;

	/* There's some risk of our "get to 0.1 seconds" loop deciding
	 * that a small number of iters was sufficient if we got
	 * scheduled out for a while.  Re-run if so.
	 *
	 * We wouldn't have that problem if we could rely on the GPU
	 * time elapsed query, but that's the thing we're testing.
	 */
	if (cpu_time_mean < 0.05)
		goto retry;

	/* Calculate stddevs. */
	delta_stddev = 0;
	for (i = 0; i < NUM_RESULTS; i++) {
		float d = delta[i] - delta_mean;
		delta_stddev += d * d / (NUM_RESULTS - 1);
	}
	delta_stddev = sqrt(delta_stddev);

	/* Dependent t-test for paired samples.
	 *
	 * This is a good test, because we expect the two times (cpu
	 * and gpu) of the samples to be correlated, and we expect the
	 * stddev to match (since time it should arise from system
	 * variables like scheduling of other tasks and state of the
	 * caches).  Unless maybe the variance of cpu time is greater
	 * than gpu time, because we may see scheduling accounted for
	 * in our CPU (wall) time, while scheduling other tasks
	 * doesn't end up counted toward our GPU time.
	 */
	t = delta_mean / (delta_stddev / sqrt(NUM_RESULTS));

	/* Integral of Student's t distribution for 4 degrees of
	 * freedom (NUM_RESULTS = 5), two-tailed (we care about
	 * difference above or below 0, not just one direction), at
	 * p = .05.
	 */
	t_cutoff = 2.776;

	/* Now test that our sampled distribution (rate of clock
	 * advance between CPU and GPU) was within expectations for a
	 * delta of 0.  I actually want to be testing the likelihood
	 * that the real difference is enough that we actually care.
	 * I didn't find an easy way to account for that after a bunch
	 * of wikipedia browsing, so I'll punt on proper analysis for
	 * now and just check that the sampled delta isn't too small
	 * to care about.
	 */
	if (t > t_cutoff && fabs(delta_mean) > .05 * cpu_time_mean) {
		fprintf(stderr, "GPU time didn't match CPU time\n");
		printf("Estimated CPU overhead: %f\n", cpu_overhead);
		printf("Difference: %f secs (+/- %f secs)\n",
		       delta_mean, delta_stddev);
		printf("t = %f\n", t);

		printf("%20s %20s %20s\n",
		       "gpu_time", "cpu_time", "delta");
		for (i = 0; i < NUM_RESULTS; i++) {
			printf("%20f %20f %20f\n",
			       gpu_time[i], cpu_time[i], delta[i]);
		}

		pass = false;
	}

	pass = piglit_probe_rect_rgba(0, 0, piglit_width, piglit_height,
				      green) && pass;

	piglit_present_results();

	glDeleteQueries(2, q);

	return pass ? PIGLIT_PASS : PIGLIT_FAIL;
}

void
piglit_init(int argc, char **argv)
{
	piglit_require_gl_version(20);

	prog = piglit_build_simple_program(vs_text, fs_text);
	iters_loc = glGetUniformLocation(prog, "iters");

	piglit_require_extension("GL_EXT_timer_query");

	if (argc == 2 && strcmp(argv[1], "timestamp") == 0) {
		piglit_require_extension("GL_ARB_timer_query");
		test = TIMESTAMP;
	}
}