@article{QasDen20A,
  title    = {Benchmarking Vision Kernels and Neural Network Inference Accelerators on Embedded Platforms},
  journal  = {Journal of Systems Architecture},
  volume   = {113},
  year     = {2021},
  month    = feb,
  abstract = {Developing efficient embedded vision applications requires exploring various algorithmic optimization trade-offs and a broad spectrum of hardware architecture choices. This makes navigating the solution space and finding the design points with optimal performance trade-offs a challenge for developers. To help provide a fair baseline comparison, we conducted comprehensive benchmarks of accuracy, run-time, and energy efficiency of a wide range of vision kernels and neural networks on multiple embedded platforms: ARM57 CPU, Nvidia Jetson TX2 GPU and Xilinx ZCU102 FPGA. Each platform utilizes their optimized libraries for vision kernels (OpenCV, VisionWorks and xfOpenCV) and neural networks (OpenCV DNN, TensorRT and Xilinx DPU). For vision kernels, our results show that the GPU achieves an energy/frame reduction ratio of 1.1--3.2x compared to the others for simple kernels. However, for more complicated kernels and complete vision pipelines, the FPGA outperforms the others with energy/frame reduction ratios of 1.2--22.3x. For neural networks [Inception-v2 and ResNet-50, ResNet-18, Mobilenet-v2 and SqueezeNet], it shows that the FPGA achieves a speed up of [2.5, 2.1, 2.6, 2.9 and 2.5]x and an EDP reduction ratio of [1.5, 1.1, 1.4, 2.4 and 1.7]x compared to the GPU FP16 implementations, respectively.},
  author   = {Qasaimeh, Murad and Denolf, Kristof and Khodamoradi, Alireza and Blott, Michaela and Lo, Jack and Halder, Lisa and Vissers, Kees and Zambreno, Joseph and Jones, Phillip}
}

@inproceedings{QasDen19A,
  title     = {Comparing Energy Efficiency of {CPU}, {GPU} and {FPGA} Implementations for Vision Kernels},
  booktitle = {Proceedings of the {IEEE} International Conference on Embedded Software and Systems ({ICESS})},
  year      = {2019},
  month     = jun,
  abstract  = {Developing high performance embedded vision applications requires balancing run-time performance with energy constraints. Given the mix of hardware accelerators that exist for embedded computer vision (e.g. multi-core CPUs, GPUs, and FPGAs), and their associated vendor optimized vision libraries, it becomes a challenge for developers to navigate this fragmented solution space. To aid with determining which embedded platform is most suitable for their application, we conduct a comprehensive benchmark of the run-time performance and energy efficiency of a wide range of vision kernels. We discuss rationales for why a given underlying hardware architecture innately performs well or poorly based on the characteristics of a range of vision kernel categories. Specifically, our study is performed for three commonly used HW accelerators for embedded vision applications: ARM57 CPU, Jetson TX2 GPU and ZCU102 FPGA, using their vendor optimized vision libraries: OpenCV, VisionWorks and xfOpenCV. Our results show that the GPU achieves an energy/frame reduction ratio of 1.1{\textendash}3.2X compared to the others for simple kernels. While for more complicated kernels and complete vision pipelines, the FPGA outperforms the others with energy/frame reduction ratios of 1.2{\textendash}22.3X. It is also observed that the FPGA performs increasingly better as a vision application{\textquoteright}s pipeline complexity grows.},
  author    = {Qasaimeh, Murad and Denolf, Kristof and Lo, Jack and Vissers, Kees and Zambreno, Joseph and Jones, Phillip}
}