@conference{ZhuAwa16A,
  title     = {ONAC: Optimal Number of Active Cores Detector for Energy Efficient GPU Computing},
  booktitle = {Proceedings of the International Conference on Computer Design (ICCD)},
  year      = {2016},
  month     = {October},
  abstract  = {Graphics Processing Units (GPUs) have become a prevalent platform for high-throughput general-purpose computing. The peak computational throughput of GPUs has been steadily increasing with each technology node by scaling the number of cores on the chip. Although this vastly improves the performance of several compute-intensive applications, our experiments show that some applications can achieve peak performance without utilizing all cores on the chip. We refer to the number of cores at which the performance of an application saturates as the optimal number of active cores (Nopt). We propose executing the application on Nopt cores and power-gating the unused cores to reduce static power consumption. Toward this goal, we present ONAC (Optimal Number of Active Cores detector), a runtime technique to detect Nopt. ONAC uses a novel estimation model, which significantly reduces the number of hardware samples taken to detect the optimal core count compared to a sequential detection technique (Seq-Det). We implement ONAC and Seq-Det in a cycle-level GPU performance simulator and analyze their effect on performance, power, and energy. Our evaluation shows that ONAC and Seq-Det can reduce energy consumption by 20\% and 10\% on average for memory-intensive applications, without sacrificing more than 2\% performance. The higher energy savings for ONAC come from reducing the detection time by 45\% compared to Seq-Det.},
  author    = {Xian Zhu and Mihir Awatramani and Diane Rover and Joseph Zambreno}
}

@conference{AwaZhu15A,
  title     = {Phase Aware Warp Scheduling: Mitigating Effects of Phase Behavior in GPGPU Applications},
  booktitle = {Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT)},
  year      = {2015},
  month     = {October},
  abstract  = {Graphics Processing Units (GPUs) have been widely adopted as accelerators for high performance computing due to the immense computational throughput they offer over their CPU counterparts. As GPU architectures are optimized for throughput, they execute a large number of SIMD threads (warps) in parallel and use hardware multithreading to hide pipeline and memory access latencies. While the Two-Level Round Robin (TLRR) and Greedy Then Oldest (GTO) warp scheduling policies have been widely accepted in the academic research community, there is no consensus regarding which policy works best for all applications. In this paper, we show that which scheduling policy works better depends on the characteristics of the instructions in different regions (phases) of the application. We identify these phases at compile time and design a novel warp scheduling policy that uses information about them to make scheduling decisions at runtime. By mitigating the adverse effects of application phase behavior, our policy always performs close to the better of the two existing policies for each application. We evaluate the performance of the warp schedulers on 35 kernels from the Rodinia and CUDA SDK benchmark suites. For applications that perform better with the GTO scheduler, our warp scheduler matches the performance of GTO with 99.2\% accuracy and achieves an average speedup of 6.31\% over TLRR.
Similarly, for applications that perform better with TLRR, the performance of our scheduler is within 98\% of TLRR, and it achieves an average speedup of 6.65\% over GTO.},
  author    = {Mihir Awatramani and Xian Zhu and Joseph Zambreno and Diane Rover}
}

@conference{AwaRov14A,
  title     = {Perf-Sat: Runtime Detection of Performance Saturation for GPGPU Applications},
  booktitle = {Proceedings of the International Workshop on Scheduling and Resource Management for Parallel and Distributed Systems (SRMPDS)},
  year      = {2014},
  month     = {September},
  abstract  = {Graphics Processing Units (GPUs) achieve latency tolerance by exploiting massive amounts of thread-level parallelism. Each core executes several hundred to a few thousand simultaneously active threads. The work scheduler tries to maximize the number of active threads on each core by launching threads until at least one of the required resources is completely utilized. The rationale is that more threads give the thread scheduler more opportunities to hide memory latency and thus result in better performance. In this work, we show that launching the maximum number of threads is not always necessary to achieve the best performance. Applications have an optimal thread count at which performance saturates; increasing the number of threads beyond this value yields no improvement and sometimes degrades performance. To this end, we develop Perf-Sat: a mechanism to detect the optimal number of threads required on each core at runtime. Perf-Sat is integrated into the hardware work scheduler and guides it to either increase or decrease the number of active threads. We evaluate the performance impact of our scheduler on two GPU generations and show that Perf-Sat scales well to different applications as well as architectures. With a performance loss of less than 1\%, Perf-Sat achieves core resource savings of 18.32\% on average.},
  author    = {Mihir Awatramani and Diane Rover and Joseph Zambreno}
}

@conference{MihZam13A,
  title     = {Increasing GPU Throughput using Kernel Interleaved Thread Block Scheduling},
  booktitle = {Proceedings of the International Conference on Computer Design (ICCD)},
  year      = {2013},
  month     = {October},
  abstract  = {The number of active threads required to achieve peak application throughput on graphics processing units (GPUs) depends largely on the ratio of time spent on computation to time spent accessing data from memory. While compute-intensive applications can achieve peak throughput with a low number of threads, memory-intensive applications might not achieve good throughput even at the maximum supported thread count. In this paper, we study the effects of scheduling work from multiple applications on the same GPU core. We claim that interleaving workloads from different applications on a GPU core can improve the utilization of computational units and reduce the load on the memory subsystem. Experiments on 17 application pairs from the Rodinia benchmark suite show that overall throughput increases by 7\% on average.},
  author    = {Mihir Awatramani and Joseph Zambreno and Diane Rover}
}