@article {2023, title = {Efficient Unmanned Aerial Systems Navigation With Collision Avoidance in Dense Urban Environments}, journal = {IEEE Transactions on Intelligent Transportation Systems (T-ITS)}, volume = {24}, year = {2023}, month = {August}, abstract = {
Unmanned Aerial Systems (UAS) are an emerging type of airborne traffic under active research expected to carry cargo and passengers in the future over dense population centers. One challenge is identifying algorithms which can compactly represent and navigate the available airspace while avoiding conflict with buildings and other UAS. In this paper, we explore a decentralized method coupling a medial axis graph for global navigation through the city with local collision avoidance of buildings and other UAS to obtain collision-free efficient navigation through an urban environment. We study trade-offs of using Optimal Reciprocal Collision Avoidance (ORCA), Rapidly-exploring Random Trees (RRT and RRT*), and Fast Markov Decision Process (FastMDP) as the collision avoidance algorithms. We examine low-altitude UAS navigating through a portion of New York City dense with skyscrapers to study the effectiveness of the algorithms in a challenging environment. We show that ORCA, RRT, RRT*, and FastMDP all are fairly efficient for 2D problems, but as the problem becomes more realistic (3D, constrained motion, aware of other UAS), FastMDP provides the best overall performance among the four algorithms studied.
}, author = {Joshua Bertram and Joseph Zambreno and Peng Wei} } @conference {2023, title = {An FPGA Implementation of SipHash}, booktitle = {Proceedings of the Reconfigurable Architectures Workshop (RAW)}, year = {2023}, month = {May}, abstract = {Cryptographic hash functions play a critical role in ensuring the security and veracity of network transactions; for example, they constitute the backbone of hash-based message authentication codes (HMACs), distributed hash tables (DHTs), and blockchain. However, cryptographic hashing can incur significant CPU overhead, especially for applications that commonly process large inputs exceeding 1 MB. This can make it infeasible to implement HMACs, DHTs, etc. in resource-constrained embedded systems or servers with strict response time requirements.\ As a solution, we present an FPGA architecture to accelerate SipHash, a promising cryptographic hash function.\ Our design constitutes the first SipHash implementation that targets maximum performance on an FPGA. The proposed architecture\&$\#$39;s throughput and acceleration vs. software were measured on Xilinx\&$\#$39;s Zynq-7000 and Ultrascale+ SoCs for a wide range of input sizes.\ These results show one core can provide single-threaded throughput of up to 13.7 Gbps on a modern FPGA fabric, and multiple parallel cores can exceed 100 Gbps, allowing applications like blockchain and peer-to-peer file sharing to scale with emerging high-bandwidth networks.\ A single core can keep pace with 10 Gigabit Ethernet, and further parallelization can empower FPGA designs to fully utilize higher network bandwidths.
}, author = {Benjamin Welte and Joseph Zambreno} } @article {2021, title = {A Fast Markov Decision Process based Algorithm for Collision Avoidance in Urban Air Mobility}, journal = {IEEE Transactions on Intelligent Transportation Systems (T-ITS)}, volume = {23}, year = {2022}, month = {September}, abstract = {The smart city landscape is rife with opportunities for mobility and economic optimization, but also presents many security concerns spanning the range of components and systems in the smart ecosystem. One key enabler for this ecosystem is smart transportation and transit, which is foundationally built upon connected vehicles. Ensuring vehicular security, while necessary to guarantee passenger and pedestrian safety, is itself challenging due to the broad attack surfaces of modern automotive systems. A single car contains dozens to hundreds of small embedded computing devices known as electronic control units (ECUs) executing 100s of millions of lines of code; the inherent complexity of this tightly-integrated cyber-physical system (CPS) is one of the key problems that frustrates effective security. We describe an approach to help reduce the complexity of security analyses by leveraging unsupervised machine learning to learn clusters of messages passed between ECUs that correlate with changes in the CPS state of a vehicle as it moves through the world. Our approach can help to improve the security of vehicles in a smart city, and can leverage smart city infrastructure to further enrich and refine the quality of the machine learning output.
}, author = {Uchenna Ezeobi and Habeeb Olufowobi and Clinton Young and Joseph Zambreno and Gedare Bloom} } @article {QasDen20A, title = {Benchmarking Vision Kernels and Neural Network Inference Accelerators on Embedded Platforms}, journal = {Journal of Systems Architecture}, volume = {113}, year = {2021}, month = {February}, abstract = {Developing efficient embedded vision applications requires exploring various algorithmic optimization trade-offs and a broad spectrum of hardware architecture choices. This makes navigating the solution space and finding the design points with optimal performance trade-offs a challenge for developers. To help provide a fair baseline comparison, we conducted comprehensive benchmarks of accuracy, run-time, and energy efficiency of a wide range of vision kernels and neural networks on multiple embedded platforms: ARM57 CPU, Nvidia Jetson TX2 GPU and Xilinx ZCU102 FPGA. Each platform utilizes their optimized libraries for vision kernels (OpenCV, VisionWorks and xfOpenCV) and neural networks (OpenCV DNN, TensorRT and Xilinx DPU). For vision kernels, our results show that the GPU achieves an energy/frame reduction ratio of 1.1\–3.2x compared to the others for simple kernels. However, for more complicated kernels and complete vision pipelines, the FPGA outperforms the others with energy/frame reduction ratios of 1.2\–22.3x. For neural networks [Inception-v2 and ResNet-50, ResNet-18, Mobilenet-v2 and SqueezeNet], it shows that the FPGA achieves a speed up of [2.5, 2.1, 2.6, 2.9 and 2.5]x and an EDP reduction ratio of [1.5, 1.1, 1.4, 2.4 and 1.7]x compared to the GPU FP16 implementations, respectively.
}, author = {Murad Qasaimeh and Kristof Denolf and Alireza Khodamoradi and Michaela Blott and Jack Lo and Lisa Halder and Kees Vissers and Joseph Zambreno and Phillip Jones} } @conference {2021, title = {An Efficient Hardware Architecture for Sparse Convolution using Linear Feedback Shift Registers}, booktitle = {Proceedings of the International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, year = {2021}, month = {July}, abstract = {Deep convolutional neural networks (CNNs) have shown remarkable success in many computer vision tasks. However, their intensive storage, bandwidth and computational requirements limit their deployment to embedded platforms. Although several research efforts have shown that pruning redundant weights could significantly reduce storage and computations, working with sparse weights remains challenging. The irregular computation of sparse weights and the overhead of managing their representation limit the efficiency of the underlaying hardware. To address these issues, we propose a hardware-friendly pruning algorithm that generates structured sparse weights. In this algorithm, locations of non-zero weights are derived on-chip in real-time using Linear Feedback Shift Registers (LFSRs) to eliminate the overhead of managing sparse weight representations. In this paper, we also propose a hardware inference engine for sparse convolution on FPGAs. It uses LFSRs to localize non-zero weights within weights tensors and avoids copying sparse weights indices by generating them on-chip. Experimental results show that the proposed pruning method can reduce the size of VGG16, ResNet50, and InceptionV3 models by 80\%, 76\% and 65\% with less than 2\% accuracy loss. Experiments also demonstrate that our accelerator can achieve 456-534 effective GOP/s for the modern CNNs on Xilinx ZCU102, which provides a 1.2-2.7\× speedup over previous sparse CNN accelerators on FPGAs.
}, doi = {QasZam21A}, author = {Murad Qasaimeh and Joseph Zambreno and Phillip Jones} } @conference {BerZam21A, title = {Scalable FastMDP for Pre-departure Airspace Reservation and Strategic De-conflict}, booktitle = {Proceedings of the AIAA SciTech Forum}, year = {2021}, month = {January}, abstract = {Pre-departure flight plan scheduling for Urban Air Mobility (UAM) and cargo delivery drones will require on-demand scheduling of large numbers of aircraft. We demonstrate an algorithm known as FastMDP-GPU that performs first-come-first-served pre-departure flight plan scheduling where conflict free flight plans are generated on demand. We demonstrate a parallelized implementation of the algorithm on a Graphics Processor Unit (GPU) and show the level of performance and scaling that can be achieved. Our results show that on commodity GPU hardware we can perform flight plan scheduling against 2000-3000 known flight plans and with server-class hardware the performance can be higher. Our results show promise for implementing a large scale UAM scheduler capable of performing on-demand flight scheduling that would be suitable for both centralized or distributed flight planning systems.
}, author = {Joshua Bertram and Joseph Zambreno and Peng Wei} } @article {SahDuw20A, title = {CyNAPSE: A Low-power Reconfigurable Neural Inference Accelerator for Spiking Neural Networks}, journal = {Journal of Signal Processing Systems}, volume = {92}, year = {2020}, abstract = {While neural network models keep scaling in depth and computational requirements, biologically accurate models are becoming more interesting for low-cost inference. Coupled with the need to bring more computation to the edge in resource-constrained embedded and IoT devices, specialized ultra-low power accelerators for spiking neural networks are being developed. Having a large variance in the models employed in these networks, these accelerators need to be flexible, user-configurable, performant and energy efficient. In this paper, we describe CyNAPSE, a fully digital accelerator designed to emulate neural dynamics of diverse spiking networks. Since the use case of our implementation is primarily concerned with energy efficiency, we take a closer look at the factors that could improve its energy consumption. We observe that while majority of its dynamic power consumption can be credited to memory traffic, its on-chip components suffer greatly from static leakage. Given that the event-driven spike processing algorithm is naturally memory-intensive and has a large number of idle processing elements, it makes sense to tackle each of these problems towards a more efficient hardware implementation. With a diverse set of network benchmarks, we incorporate a detailed study of memory patterns that ultimately informs our choice of an application-specific network-adaptive memory management strategy to reduce dynamic power consumption of the chip. Subsequently, we also propose and evaluate a leakage mitigation strategy for runtime control of idle power. Using both the RTL implementation and a software simulation of CyNAPSE, we measure the relative benefits of these undertakings. Results show that our adaptive memory management policy results in up to 22\% more reduction in dynamic power consumption.}, author = {Saunak Saha and Henry Duwe and Joseph Zambreno} } @conference {KemZha20A, title = {Embedding Online Runtime Verification for Fault Disambiguation on Robonaut2}, booktitle = {Proceedings of the International Conference on Formal Modeling and Analysis of Timed Systems (FORMATS)}, year = {2020}, month = {September}, abstract = {Robonaut2 (R2) is a humanoid robot onboard the International Space Station (ISS), performing specialized tasks in collaboration with astronauts. After deployment, R2 developed an unexpected emergent behavior. R2{\textquoteright}s inability to distinguish between knee-joint faults (e.g., due to sensor drift versus violated environmental assumptions) began triggering mid-task, safety-preserving freezes-in-place in the confined space of the ISS, preventing further motion until a ground-control operator determines the root-cause and initiates proper corrective action. Runtime verification (RV) algorithms can efficiently disambiguate the temporal signatures of different faults in real-time. However, no previous RV engine can operate within the limited available resources and specialized platform constraints of R2{\textquoteright}s hardware architecture. An attempt to deploy the only runtime verification engine designed for embedded flight systems, R2U2, failed due to resource constraints. We present a significant redesign of the core R2U2 algorithms to adapt to severe resource and certification constraints and prove their correctness, time complexity, and space requirements. We further define optimizations enabled by our new algorithms and implement the new version of R2U2. We encode specifications describing real-life faults occurring onboard Robonaut2 using MLTL and detail our process of specification debugging, validation, and refinement. We deploy this new version of R2U2 on Robonaut2, demonstrating successful real-time fault disambiguation and mitigation triggering of R2{\textquoteright}s knee-joint faults without false positives.}, author = {Brian Kempa and Pei Zhang and Phillip Jones and Joseph Zambreno and Kristin Yvonne Rozier} } @conference {PivJon20A, title = {ParaHist: FPGA Implementation of Parallel Event-Based Histogram for Optical Flow Calculation}, booktitle = {Proceedings of the International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, year = {2020}, month = {July}, abstract = {In this paper, we present an FPGA-based architecture for histogram generation to support event-based camera optical flow calculation. Our proposed histogram generation mechanism reduces memory and logic resources by storing the time difference between consecutive events, instead of the absolute time of each event. Additionally, we explore the trade-off between system resource usage and histogram accuracy as a function of the precision at which time is encoded. Our results show that across three event-based camera benchmarks we can reduce the encoding of time from 32 to 7 bits with a loss of only approximately 3\% in histogram accuracy. In comparison to a software implementation, our architecture shows a significant speedup.}, author = {Mohammad Pivezhandi and Phillip Jones and Joseph Zambreno} } @article {OluYou19A, title = {SAIDuCANT: Specification-based Automotive Intrusion Detection using Controller Area Network (CAN) Timing}, journal = {IEEE Transactions on Vehicular Technology}, volume = {69}, year = {2020}, month = {February}, abstract = {The proliferation of embedded devices in modern vehicles has opened the traditionally-closed vehicular system to the risk of cybersecurity attacks through physical and remote access to the in-vehicle network such as the controller area network (CAN). The CAN bus does not implement a security protocol that can protect the vehicle against the increasing cyber and physical attacks. To address this risk, we introduce a novel algorithm to extract the real-time model parameters of the CAN bus and develop SAIDuCANT, a specification-based intrusion detection system (IDS) using anomaly-based supervised learning with the real-time model as input.We evaluate the effectiveness of SAIDuCANT with real CAN logs collected from two passenger cars and on an open source CAN dataset collected from real-world scenarios. Experimental results show that SAIDuCANT can effectively detect data injection attacks with low false positive rates and outperforms other detection approaches using the timing features of CAN messages.}, author = {Habeeb Olufowobi and Clinton Young and Joseph Zambreno and Gedare Bloom} } @conference {YouSvo20A, title = {Towards Reverse Engineering Controller Area Network Messages Using Machine Learning}, booktitle = {Proceedings of the IEEE World Forum on Internet of Things (WF-IoT)}, year = {2020}, month = {April}, abstract = {The automotive Controller Area Network (CAN) allows Electronic Control Units (ECUs) to communicate with each other and control various vehicular functions such as engine and braking control. Consequently CAN and ECUs are high priority targets for hackers. As CAN implementation details are held as proprietary information by vehicle manufacturers, it can be challenging to decode and correlate CAN messages to specific vehicle operations. To understand the precise meanings of CAN messages, reverse engineering techniques that are time-consuming, manually intensive, and require a physical vehicle are typically used. This work aims to address the process of reverse engineering CAN messages for their functionality by creating a machine learning classifier that analyzes messages and determines their relationship to other messages and vehicular functions. Our work examines CAN traffic of different vehicles and standards to show that it can be applied to a wide arrangement of vehicles. The results show that the function of CAN messages can be determined without the need to manually reverse engineer a physical vehicle. }, author = {Clinton Young and Jordan Svoboda and Joseph Zambreno} } @conference {SahDuw19A, title = {An Adaptive Memory Management Strategy Towards Energy Efficient Machine Inference in Event-Driven Neuromorphic Accelerators}, booktitle = {Proceedings of the International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, year = {2019}, month = {July}, abstract = {Spiking neural networks are viable alternatives to classical neural networks for edge processing in low-power embedded and IoT devices. To reap their benefits, neuromorphic network accelerators that tend to support deep networks still have to expend great effort in fetching synaptic states from a large remote memory. Since local computation in these networks is event-driven, memory becomes the major part of the system{\textquoteright}s energy consumption. In this paper, we explore various opportunities of data reuse that can help mitigate the redundant traffic for retrieval of neuron meta-data and post-synaptic weights. We describe CyNAPSE, a baseline neural processing unit and its accompanying software simulation as a general template for exploration on various levels. We then investigate the memory access patterns of three spiking neural network benchmarks that have significantly different topology and activity. With a detailed study of locality in memory traffic, we establish the factors that hinder conventional cache management philosophies from working efficiently for these applications. To that end, we propose and evaluate a domain-specific management policy that takes advantage of the forward visibility of events in a queue-based event-driven simulation framework. Subsequently, we propose network-adaptive enhancements to make it robust to network variations. As a result, we achieve 13-44\% reduction in system power consumption and 8-23\% improvement over conventional replacement policies.}, author = {Saunak Saha and Henry Duwe and Joseph Zambreno} } @conference {OluEze19A, title = {Anomaly Detection Approach Using Adaptive Cumulative Sum Algorithm for Controller Area Network}, booktitle = {Proceedings of the ACM Workshop on Automotive Cybersecurity (AutoSec)}, year = {2019}, month = {March}, abstract = {The modern vehicle has transformed from a purely mechanical system to a system that embeds several electronic devices. These devices communicate through the in-vehicle network for enhanced safety and comfort but are vulnerable to cyber-physical risks and attacks. A well-known technique of detecting these attacks and unusual events is by using intrusion detection systems. Anomalies in the network occur at unknown points and produce abrupt changes in the statistical features of the message stream. In this paper, we propose an anomaly-based intrusion detection approach using the cumulative sum (CUSUM) change-point detection algorithm to detect data injection attacks on the controller area network (CAN) bus. We leverage the parameters required for the change-point algorithm to reduce false alarm rate and detection delay. Using real dataset generated from a car in normal operation, we evaluate our detection approach on three different kinds of attack scenarios.}, author = {Habeeb Olufowobi and Uchenna Ezeobi and Eric Muhati and Gaylon Robinson and Clinton Young and Joseph Zambreno and Gedare Bloom} } @conference {YouOlu19A, title = {Automotive Intrusion Detection Based on Constant CAN Message Frequencies Across Vehicle Driving Modes}, booktitle = {Proceedings of the ACM Workshop on Automotive Cybersecurity (AutoSec)}, year = {2019}, month = {March}, abstract = {The modern automobile relies on numerous electronic control units communicating over the de facto standard of the controller area network (CAN) bus. This communication network was not developed with cybersecurity in mind. Many methods based on constant time intervals between messages have been proposed to address this lack of security issue with the CAN bus. However, these existing methods may struggle to handle variable time intervals between messages during transitions of vehicle driving modes. This paper proposes a simple and cost-effective method to ensure the security of the CAN bus that is based on constant message frequencies across vehicle driving modes. This proposed method does not require any modifications on the existing CAN bus and it is designed with the intent for efficient execution in platforms with very limited computational resources. Test results with the proposed method against two different vehicles and a frequency domain analysis are also presented in the paper.}, author = {Clinton Young and Habeeb Olufowobi and Gedare Bloom and Joseph Zambreno} } @conference {QasDen19A, title = {Comparing Energy Efficiency of CPU, GPU and FPGA Implementations for Vision Kernels}, booktitle = {Proceedings of the IEEE International Conference on Embedded Software and Systems (ICESS)}, year = {2019}, month = {June}, abstract = {Developing high performance embedded vision applications requires balancing run-time performance with energy constraints. Given the mix of hardware accelerators that exist for embedded computer vision (e.g. multi-core CPUs, GPUs, and FPGAs), and their associated vendor optimized vision libraries, it becomes a challenge for developers to navigate this fragmented solution space. To aid with determining which embedded platform is most suitable for their application, we conduct a comprehensive benchmark of the run-time performance and energy efficiency of a wide range of vision kernels. We discuss rationales for why a given underlying hardware architecture innately performs well or poorly based on the characteristics of a range of vision kernel categories. Specifically, our study is performed for three commonly used HW accelerators for embedded vision applications: ARM57 CPU, Jetson TX2 GPU and ZCU102 FPGA, using their vendor optimized vision libraries: OpenCV, VisionWorks and xfOpenCV. Our results show that the GPU achieves an energy/frame reduction ratio of 1.1{\textendash}3.2X compared to the others for simple kernels. While for more complicated kernels and complete vision pipelines, the FPGA outperforms the others with energy/frame reduction ratios of 1.2{\textendash}22.3X. It is also observed that the FPGA performs increasingly better as a vision application{\textquoteright}s pipeline complexity grows.}, author = {Murad Qasaimeh and Kristof Denolf and Jack Lo and Kees Vissers and Joseph Zambreno and Phillip Jones} } @article {YouOlu19B, title = {Survey of Automotive Controller Area Network Intrusion Detection Systems}, journal = {IEEE Design and Test}, volume = {36}, year = {2019}, month = {December}, abstract = {Control Area Network (CAN) is one of the most popular targets for malicious attacks and exploitations in modern automotive systems. The goal of intrusion detection systems (IDS) is to identify and mitigate security attacks; consequently, they are of paramount importance to automotive security. This article surveys the state of the art in IDS, with special emphasis on techniques for detecting attacks on CAN modules.}, author = {Clinton Young and Habeeb Olufowobi and Gedare Bloom and Joseph Zambreno} } @article {GriDav18A, title = {ARMOR: A Recompilation and Instrumentation-free Monitoring Architecture for Detecting Memory Exploits}, journal = {IEEE Transactions on Computers (TC)}, volume = {67}, year = {2018}, abstract = {Software written in programming languages that permit manual memory management, such as C and C++, are often littered with exploitable memory errors. These memory bugs enable attackers to leak sensitive information, hijack program control flow, or otherwise compromise the system and are a critical concern for computer security. Many runtime monitoring and protection approaches have been proposed to detect memory errors in C and C++ applications, however, they require source code recompilation or binary instrumentation, creating compatibility challenges for applications using proprietary or closed source code, libraries, or plug-ins. This paper introduces a new approach for detecting heap memory errors that does not require applications to be recompiled or instrumented. We show how to leverage the calling convention of a processor to track all dynamic memory allocations made by an application during runtime. We also present a transparent tracking and caching architecture to efficiently verify program heap memory accesses. Performance simulations of our architecture using SPEC benchmarks and real-world application workloads show our architecture achieves hit rates over 95\% for a 256-entry cache, resulting in only 2.9\% runtime overhead. Security analysis using a software prototype shows our architecture detects 98\% of heap memory errors from selected test cases in the Juliet Test Suite and real-world exploits.}, author = {Alex Grieve and Michael Davies and Phillip Jones and Joseph Zambreno} } @conference {AveJon18A, title = {An FPGA-based Hardware Accelerator for Iris Segmentation}, booktitle = {Proceedings of the International Conference on ReConFigurable Computing and FPGAs (ReConFig)}, year = {2018}, month = {December}, abstract = {Biometric authentication is becoming an increasingly prevalent way to identify a person based on unique physical traits such as their fingerprints, face, and iris. The iris stands out particularly among these traits due to its relative invariability with time and high uniqueness. However, iris recognition without special, dedicated tools like near-infrared (NIR) cameras and stationary high-performance computers is a challenge. Solutions have been proposed to target mobile platforms like smart phones and tablets by making use of the Red-Green-Blue (RGB) camera commonly found on those platforms. These solutions tend to be slower than the former due to the reduced performance available in mobile processors. In this paper, we detail a Field Programmable Gate Array (FPGA)-based System on Chip (SoC) approach to help address the mobility and performance challenges that exist in current iris segmentation solutions. Our SoC architecture allows us to run the iris recognition system in software, while accelerating slower parts of the system by using parallel, dedicated hardware modules. Our approach showed a speedup in segmentation of 22x when compared to an x86-64 platform and 468x when compared to an ARMv7 platform.}, author = {Joe Avey and Phillip Jones and Joseph Zambreno} } @conference {CauZam18A, title = {HW/SW Configurable LQG Controller using a Sequential Discrete Kalman Filter}, booktitle = {Proceedings of the International Conference on ReConFigurable Computing and FPGAs (ReConFig)}, year = {2018}, month = {December}, abstract = {A hardware/software configurable architecture for Linear Quadratic Gaussian (LQG) control is presented, which is a combination of a Linear Quadratic Regulator (LQR) control law with a Discrete Kalman Filter (DKF) state estimator. LQG controllers are ideal candidates for hardware acceleration, since the DKF algorithm requires matrix inversion, which is time consuming and potentially parallelizable. In this design, a functionally equivalent DKF method, called the Sequential Discrete Kalman Filter (SDKF), is used to transform the matrix inversion into an iterative scalar inversion. The proposed design acts as an Intellectual Property (IP) Core, where the user can adjust scaling parameters in hardware and configuration parameters in software to tailor the given architecture to a wide range of physical systems. This differentiates the proposed design from other LQG implementations since it is not application specific; in fact, this architecture, which was targeted for a Xilinx Zynq-7020 FPGA, allows for systems of state size 4 to 128 and achieves a speedup of 23.6 to 167 over a 2.7GHz quad-core processor. The goal of this approach is to support a design methodology for bridging the gap between control theory and embedded systems applications. For evaluation, this architecture was compared to a pure software LQG implementation. Additionally, the approach and results of recent LQG and LQG-related hardware designs were analyzed and compared to the proposed design.}, author = {Matthew Cauwels and Joseph Zambreno and Phillip Jones} } @conference {ZhuWer18A, title = {Improving First Level Cache Efficiency for GPUs Using Dynamic Line Protection}, booktitle = {Proceedings of the International Conference on Parallel Processing (ICPP)}, year = {2018}, month = {August}, abstract = { A modern Graphics Processing Unit (GPU) utilizes L1 Data (L1D) caches to reduce memory bandwidth requirements and latencies. However, the L1D cache can easily be overwhelmed by many memory requests from GPU function units, which can bottleneck GPU performance. It has been shown that the performance of L1D caches is greatly reduced for many GPU applications as a large amount of L1D cache lines are replaced before they are re-referenced. By examining the cache access patterns of these applications, we observe L1D caches with low associativity have difficulty capturing data locality for GPU applications with diverse reuse patterns. These patterns result in frequent line replacements and low data re-usage. To improve the efficiency of L1D caches, we design a Dynamic Line Protection scheme (DLP) that can both preserve valuable cache lines and increase cache line utilization. DLP collects data reuse information from the L1D cache. This information is used to predict protection distances for each memory instruction at runtime, which helps maintain a balance between exploitation of data locality and over-protection of cache lines with long reuse distances. When all cache lines are protected in a set, redundant cache misses are bypassed to reduce the contention for the set. The evaluation result shows that our proposed solution improves cache hits while reducing cache traffic for cache-insufficient applications, achieving up to 137\% and an average of 43\% IPC improvement over the baseline.}, author = {Xian Zhu and Robert Wernsman and Joseph Zambreno} } @conference {QasZam18A, title = {A Runtime Configurable Hardware Architecture for Computing Histogram-based Feature Descriptors}, booktitle = {Proceedings of the International Symposium on Field-Programmable Logic and Applications (FPL)}, year = {2018}, month = {August}, abstract = {Feature description is an essential component of many computer vision applications. It encodes the visual contents of images in a manner that is robust against various image transformations. Computing these descriptors is computationally expensive, which causes a performance bottleneck in many embedded vision systems. Although many hardware architectures have been proposed to accelerate feature description computation, most target a single feature description algorithm under specific constraints. The lack of flexibility of such implementations increases development effort if deployed applications need to be modified or upgraded. In this paper, we propose a software configurable hardware architecture capable of computing different types of histogram-based feature descriptors without the need for re-synthesizing the hardware. The architecture takes advantage of data streaming to reduce the computational complexity of computing this class of descriptor. To illustrate the efficiency of our architecture, we deploy two of the most commonly used descriptors (SIFT and HOG) and compare their quality with software implementations. The architecture is also evaluated in terms of execution speed and resource usage and compared with dedicated hardware architectures. Our flexible architecture shows a speed up of 3x and 5x compared to state-of-the-art dedicated hardware architectures for SIFT and HOG, with resource usage overheads [LUTs, FFs, and DSPs] of [1.1x, 15x, and 1.6x] and [6.4x, 7x, and 32x] for SIFT and HOG, respectively.}, author = {Murad Qasaimeh and Joseph Zambreno and Phillip Jones} } @conference {OluBlo18A, title = {Work-in-Progress: Real-Time Modeling for Intrusion Detection in Automotive Controller Area Network}, booktitle = {Proceedings of the IEEE Real-Time Systems Symposium (RTSS)}, year = {2018}, month = {December}, abstract = {Security of vehicular networks has often been an afterthought since they are designed traditionally to be a closed system. An attack could lead to catastrophic effect which may include loss of human life or severe injury to the driver and passengers of the vehicle. In this paper, we propose a novel algorithm to extract the real-time model of the controller area network (CAN) and develop a specification-based intrusion detection system (IDS) using anomaly-based supervised learning with the real-time model as input. We evaluate IDS performance with real CAN logs collected from a sedan car.}, author = {Habeeb Olufowobi and Gedare Bloom and Clinton Young and Joseph Zambreno} } @article {ZhaMil17A, title = {The Design and Integration of a Software Configurable and Parallelized Coprocessor Architecture for LQR Control}, journal = {Journal of Parallel and Distributed Computing (JPDC)}, volume = {106}, year = {2017}, pages = {121-131}, abstract = {We present a software configurable and parallelized coprocessor architecture for LQR control that can control physical processes representable by a linear state-space model. Our proposed architecture has distinct advantages over purely software or purely hardware approaches. It differs from other hardware controllers in that it is not hardwired to control one or a small range of plant types (e.g. only electric motors). Via software, an embedded systems engineer can easily reconfigure the controller to suit a wide range of controls applications that can be represented as a state-space model. One goal of our approach is to support a design methodology to help bridge the gap between controls and embedded system software engineering. Control of the well-understood inverted pendulum on a cart is used as an illustrative example of how the proposed hardware accelerator architecture supports our envisioned design methodology for helping bridge this gap. Additionally, we explore the design space of our co-processor{\textquoteright}s parallel architecture in terms of computing speed and resource utilization. Our performance results show a 3.4 to 100 factor speedup over a 666 MHz embedded ARM processor, for plants that can be represented by 4 to 128 states respectively. Finally we describe integration of the controller with the input and output interfaces needed to control a real inverted pendulum. }, author = {Pei Zhang and Aaron Mills and Joseph Zambreno and Phillip Jones} } @conference {ZhaZam17A, title = {An Embedded Scalable Linear Model Predictive Hardware-based Controller using ADMM}, booktitle = {Proceedings of the International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, year = {2017}, month = {July}, abstract = {Model predictive control (MPC) is a popular advanced model-based control algorithm for controlling systems that must respect a set of system constraints (e.g. actuator force limitations). However, the computing requirements of MPC limits the suitability of deploying its software implementation into embedded controllers requiring high update rates. This paper presents a scalable embedded MPC controller implemented on a field-programmable gate array (FPGA) coupled with an on-chip ARM processor. Our architecture implements an Alternating Direction Method of Multipliers (ADMM) approach for computing MPC controller commands. All computations are performed using floating-point arithmetic. We introduce a software/hardware (SW/HW) co-design methodology, for which the ARM software can configure on-chip Block RAM to allow users to 1) configure the MPC controller for a wide range of plants, and 2) update at runtime the desired trajectory to track. Our hardware architecture has the flexibility to compromise between the amount of hardware resources used (regarding Block RAMs and DSPs) and the controller computing speed. For example, this flexibility gives the ability to control plants modeled by a large number of decision variables (i.e. a plant model using many Block RAMs) with a small number of computing resources (i.e. DSPs) at the cost of increased computing time. The hardware controller is verified using a Plant-on-Chip (PoC), which is configured to emulate a mass-spring system in real-time. A major driving goal of this work is to architect an SW/HW platform that brings FPGAs a step closer to being widely adopted by advanced control algorithm designers for deploying their algorithms into embedded systems.}, author = {Pei Zhang and Joseph Zambreno and Phillip Jones} } @conference {QasJon17A, title = {A Modified Sliding Window Architecture for Efficient BRAM Resource Utilization}, booktitle = {Proceedings of the Reconfigurable Architectures Workshop (RAW)}, year = {2017}, month = {May}, abstract = {Sliding window is one of the most commonly used techniques in image processing algorithms. Implementing it in hardware requires buffering image rows on-chip to exploit data locality and avoid redundant off-chip pixel transfers. However, scaling this architecture to large window sizes or high resolutions linearly increases on-chip memory utilization. This imposes limitations on porting many image processing algorithms into hardware efficiently. In this paper, we propose a new sliding window architecture that utilizes less on-chip memory resources while maintaining performance as compared to the traditional method. The proposed architecture exploits that most natural images have smooth color variations with fine details in between these variations to compress images. It decomposes non-zero image pixels into their wavelet components and represents each wavelet coefficient with a minimum number of bits. The architecture is also flexible to use lossless or lossy compression based on a configurable threshold value. The FPGA implementation of our proposed architecture shows memory saving of 25-70\% compared to the traditional architecture using lossless compression, and for lossy compression with up to a mean square error of 5 achieves up to 84\% in memory savings.}, author = {Murad Qasaimeh and Phillip Jones and Joseph Zambreno} } @conference {ZhuAwa16A, title = {ONAC: Optimal Number of Active Cores Detector for Energy Efficient GPU Computing}, booktitle = {Proceedings of the International Conference on Computer Design (ICCD)}, year = {2016}, month = {October}, abstract = {Graphics Processing Units (GPUs) have become a prevalent platform for high throughput general purpose computing. The peak computational throughput of GPUs has been steadily increasing with each technology node by scaling the number of cores on the chip. Although this vastly improves the performance of several compute-intensive applications, our experiments show that some applications can achieve peak performance without utilizing all cores on the chip. We refer to the number of cores at which performance of an application saturates as the optimal number of active cores (Nopt). We propose executing the application on Nopt cores, and power-gating the unused cores to reduce static power consumption. Towards this target, we present ONAC (Optimal Number of Active Cores detector), a runtime technique to detect Nopt. ONAC uses a novel estimation model, which significantly reduces the number of hardware samples taken to detect the optimal core count, compared to a sequential detection technique (SeqDet). We implement ONAC and Seq-Det in a cycle-level GPU performance simulator and analyze their effect on performance, power and energy. Our evaluation shows that ONAC and Seq-Det can reduce energy consumption by 20\% and 10\% on average for memory-intensive applications, without sacrificing more than 2\% performance. The higher energy savings for ONAC comes from reducing the detection time by 45\% as compared to Seq-Det.}, author = {Xian Zhu and Mihir Awatramani and Diane Rover and Joseph Zambreno} } @conference {WanZam16A, title = {Parallelizing Latent Semantic Indexing Using an FPGA-based Architecture}, booktitle = {Proceedings of the International Conference on Computer Design (ICCD)}, year = {2016}, month = {October}, abstract = {Latent Semantic Indexing (LSI) has played a significant role in discovering patterns on the relationships between query terms and unstructured documents. However, the inherent characteristics of complex matrix factorization in LSI make it difficult to meet stringent performance requirements. In this paper, we present a deeply pipelined reconfigurable architecture for LSI, which parallelizes the matrix factorization and dimensionality reduction, computation of cosine similarity between vectors, and the ranking of documents. Our architecture implements the reduced Singular Value Decomposition with Hestenes-Jacobi algorithm, in which both singular values and orthogonal vectors are collected, and its components can be reconfigured to update query vector coordinate and calculate query-document similarity. In addition, an ordered tree structure is used to reduce the matrix dimension and rank the documents. Analysis of our design indicates the potential to achieve a performance of 8.9 GFLOPS with dimension-dependent speedups over an optimized software implementation that range from 3.8x to 10.1x in terms of computation time.}, author = {Xinying Wang and Joseph Zambreno} } @conference {MilJon16A, title = {Parameterizable FPGA-based Kalman Filter Coprocessor Using Piecewise Affine Modeling}, booktitle = {Proceedings of the Reconfigurable Architectures Workshop (RAW)}, year = {2016}, month = {May}, abstract = {The Kalman Filter is a robust tool often employed as a plant observer in control systems. However, in the general case the high computational cost, especially for large system models or fast sample rates, makes it an impractical choice for typical low-power microcontrollers. Industry trends towards tighter integration and subsystem consolidation point to the use of powerful high-end SoCs, but this complicates the ability for a controls engineer to verify correct behavior of the system under all conditions, which is important in safety-critical systems. Dedicated FPGA hardware can provide computational speedup, in addition to firmer design partitioning in mixed-criticality systems and fully deterministic timing, which helps ensure a control system behaves as close as possible to offline simulations. We introduce and compare two variants of a software-configurable FPGA-based implementation of a Kalman Filter. The first is an implementation of an Extended Kalman Filter, while the second is a novel approach{\textendash}the Piecewise-Affine Kalman Filter{\textendash}which may offer significant advantages for certain types of applications. The state estimate update time and resource requirements are analyzed for plant models up to 28 states. For large models, the designs provide a speedup of 7-12x compared to reference ARM9020T software implementations. An application-agnostic performance analysis demonstrates how the Piecewise-Affine Kalman Filter reduces the software workload and the communication overhead compared to the standard mixed hardware-software Extended Kalman Filter approach.}, author = {Aaron Mills and Phillip Jones and Joseph Zambreno} } @article {NelTow16A, title = {RAMPS: A Reconfigurable Architecture for Minimal Perfect Sequencing}, journal = {IEEE Transactions on Parallel and Distributed Systems (TPDS)}, volume = {27}, year = {2016}, abstract = {The alignment of many short sequences of DNA, called reads, to a long reference genome is a common task in molecular biology. When the problem is expanded to handle typical workloads of billions of reads, execution time becomes critical. In this paper we present a novel reconfigurable architecture for minimal perfect sequencing (RAMPS). While existing solutions attempt to align a high percentage of the reads using a small memory footprint, RAMPS focuses on performing fast exact matching. Using the human genome as a reference, RAMPS aligns short reads hundreds of thousands of times faster than current software implementations such as SOAP2 or Bowtie, and about a thousand times faster than GPU implementations such as SOAP3. Whereas other aligners require hours to preprocess reference genomes, RAMPS can preprocess the reference human genome in a few minutes, opening the possibility of using new reference sources that are more genetically similar to the newly sequenced data.}, author = {Chad Nelson and Kevin Townsend and Osama Attia and Phillip Jones and Joseph Zambreno} } @conference {YouZam16A, title = {Towards a Fail-Operational Intrusion Detection System for In-Vehicle Networks}, booktitle = {Proceedings of the Workshop on Security and Dependability of Critical Embedded Real-Time Systems (CERTS)}, year = {2016}, month = {November}, abstract = {The landscape of automotive in-vehicle networks is changing driven by the vast options for infotainment features and progress toward fully-autonomous vehicles. However, the security of automotive networks is lagging behind feature-driven technologies, and new vulnerabilities are constantly being discovered. In this paper, we introduce a road map towards a security solution for in-vehicle networks that can detect anomalous and failed states of the network and adaptively respond in real-time to maintain a fail-operational system.}, author = {Clinton Young and Joseph Zambreno and Gedare Bloom} } @conference {AttGri15A, title = {Accelerating All-Pairs Shortest Path Using A Message-Passing Reconfigurable Architecture}, booktitle = {Proceedings of the International Conference on ReConFigurable Computing and FPGAs (ReConFig)}, year = {2015}, month = {December}, abstract = {In this paper, we study the design and implementation of a reconfigurable architecture for graph processing algorithms. The architecture uses a message-passing model targeting shared-memory multi-FPGA platforms. We take advantage of our architecture to showcase a parallel implementation of the all-pairs shortest path algorithm (APSP) for unweighted directed graphs. Our APSP implementation adopts a fine-grain processing methodology while attempting to minimize communication and synchronization overhead. Our design utilizes 64 parallel kernels for vertex-centric processing. We evaluate a prototype of our system on a Convey HC-2 high performance computing platform, in which our performance results demonstrates an average speedup of 7.9x over the sequential APSP algorithm and an average speedup of 2.38x over a multi-core implementation. }, author = {Osama Attia and Alex Grieve and Kevin Townsend and Phillip Jones and Joseph Zambreno} } @conference {WanJon15A, title = {A Configurable Architecture for Sparse LU Decomposition on Matrices with Arbitrary Patterns}, booktitle = {Proceedings of the International Symposium on Highly-Efficient Accelerators and Reconfigurable Technologies (HEART)}, year = {2015}, month = {June}, abstract = {Sparse LU decomposition has been widely used to solve sparse linear systems of equations found in many scientific and engineering applications, such as circuit simulation, power system modeling and computer vision. However, it is considered a computationally expensive factorization tool. While parallel implementations have been explored to accelerate sparse LU decomposition, irregular sparsity patterns often limit their performance gains. Prior FPGA-based accelerators have been customized to domain-specific sparsity patterns of pre-ordered symmetric matrices. In this paper, we present an efficient architecture for sparse LU decomposition that supports both symmetric and asymmetric sparse matrices with arbitrary sparsity patterns. The control structure of our architecture parallelizes computation and pivoting operations. Also, on-chip resource utilization is configured based on properties of the matrices being processed. Our experimental results show a 1.6 to 14x speedup over an optimized software implementation for benchmarks containing a wide range of sparsity patterns.}, author = {Xinying Wang and Phillip Jones and Joseph Zambreno} } @article {WanJon16A, title = {A Configurable Architecture for Sparse LU Decomposition on Matrices with Arbitrary Patterns}, journal = {ACM Computer Architecture News (CAN)}, volume = {43}, year = {2015}, month = {September}, abstract = {Sparse LU decomposition has been widely used to solve sparse linear systems of equations found in many scientific and engineering applications, such as circuit simulation, power system modeling and computer vision. However, it is considered a computationally expensive factorization tool. While parallel implementations have been explored to accelerate sparse LU decomposition, irregular sparsity patterns often limit their performance gains. Prior FPGA-based accelerators have been customized to domain-specific sparsity patterns of pre-ordered symmetric matrices. In this paper, we present an efficient architecture for sparse LU decomposition that supports both symmetric and asymmetric sparse matrices with arbitrary sparsity patterns. The control structure of our architecture parallelizes computation and pivoting operations. Also, on-chip resource utilization is configured based on properties of the matrices being processed. Our experimental results show a 1.6 to 14x speedup over an optimized software implementation for benchmarks containing a wide range of sparsity patterns.}, author = {Xinying Wang and Phillip Jones and Joseph Zambreno} } @conference {MilZam15A, title = {Estimating State of Charge and State of Health of Rechargable Batteries on a Per-Cell Basis}, booktitle = {Proceedings of the Workshop on Modeling and Simulation of Cyber-Physical Energy Systems (MSCPES)}, year = {2015}, month = {April}, abstract = {Much of current research on State-of-Charge (SOC) and State-of-Health (SOH) tracking for rechargeable batteries such as Li-ion focuses primarily on analyzing single cells, or otherwise treat a set of series-connected cells as a homogeneous unit. Since no two cells have precisely the same properties, for applications involving large batteries this can severely limit the accuracy and utility of the approach. In this paper we develop an model-driven approach using a Dual Unscented Kalman Filter to allow a Battery Monitoring System (BMS) to monitor in real time both SOC and SOH of each cell in a battery. A BMS is an example of a Cyber-Physical System (CPS) which requires deep understanding of the behavior of the physical system (i.e., the battery) in order to obtain reliability in demanding applications. In particular, since the SOH of a cell changes extremely slowly compared to SOC, our dual filter operates on two timescales to improve SOH tracking. We show that the use of the Unscented Kalman Filter instead of the more common Extended Kalman Filter simplifies the development of the system model equations in the multiscale case. We also show how a single {\textquotedblleft}average{\textquotedblright} cell model can be used to accurately estimate SOH for different cells and cells of different ages. }, author = {Aaron Mills and Joseph Zambreno} } @article {GupVya15A, title = {A Fault-aware Toolchain Approach for FPGA Fault Tolerance}, journal = {ACM Transactions on Design Automation of Electronic Systems (TODAES)}, volume = {20}, number = {2}, year = {2015}, abstract = {As the size and density of silicon chips continue to increase, maintaining acceptable manufacturing yields has become increasingly difficult. Recent works suggest that lithography techniques are reaching their limits with respect to enabling high yield fabrication of small-scale devices, thus there is an increasing need for techniques that can tolerate fabrication time defects. One candidate technology to help combat these defects is reconfigurable hardware. The flexible nature of reconfigurable devices, such as Field Programmable Gate Arrays (FPGAs), makes it possible for them to route around defective areas of a chip after the device has been packaged and deployed into the field. This work presents a technique that aims to increase the effective yield of FPGA manufacturing by re-claiming a portion of chips that would be ordinarily classified as unusable. In brief, we propose a modification to existing commercial toolchain flows to make them fault aware. A phase is added to identify faults within the chip. The locations of these faults are then used by the toolchain to avoid faults during the placement and routing phase. Specifically, we have applied our approach to the Xilinx commercial toolchain flow and evaluated its tolerance to both logic and routing resource faults. Our findings show that, at a cost of 5-10\% in device frequency performance, the modified toolchain flow can tolerate up to 30\% of logic resources being faulty and, depending on the nature of the target application, can tolerate 1-30\% of the device{\textquoteright}s routing resources being faulty. These results provide strong evidence that commercial toolchains not designed for the purpose of tolerating faults can still be greatly leveraged in the presence of faults to place and route circuits in an efficient manner.}, author = {Adwait Gupte and Sudhanshu Vyas and Phillip Jones} } @article {JohRog15A, title = {An FPGA Architecture for the Recovery of WPA/WPA2 Keys}, journal = {Journal of Circuits, Systems, and Computers (JCSC)}, volume = {24}, year = {2015}, abstract = {Wi-Fi protected access (WPA) has provided serious improvements over the now deprecated wired equivalent privacy (WEP) protocol. WPA, however, still has some flaws that allow an attacker to obtain the passphrase. One of these flaws is exposed when the access point (AP) is operating in the WPA personal mode. This is the most common mode, as it is the quickest and easiest to configure. This vulnerability requires the attacker to capture the traffic from the four-way handshake between the AP and client, and then have enough compute time to reverse the passphrase. Increasing the rate at which passphrases can be reversed reduces the amount of time required to construct a repository of service set identifiers (SSIDs) and passphrases, which can increase the chances an attack is successful, or, alternatively, reduce the difficulty of auditing a wireless network for security purposes. This work focuses on creating a Field programmable gate array (FPGA)-based architecture to accelerate the generation of a WPA/WPA2 pairwise master key (PMK) lookup table (LUT) for the recovery of the passphrase, with special emphasis on the secure hash algorithm-1 (SHA-1) implementation. PMK generation relies heavily on SHA-1 hashing and, as this work shows, an optimized SHA-1 implementation can achieve up to a 40 speedup over an unoptimized implementation when generating PMKs}, author = {Tyler Johnson and Daniel Roggow and Phillip Jones and Joseph Zambreno} } @conference {TowSun15A, title = {k-NN Text Classification using an FPGA-Based Sparse Matrix Vector Multiplication Accelerator}, booktitle = {Proceedings of the IEEE International Conference on Electro/Information Technology (EIT)}, year = {2015}, month = {May}, abstract = {Text classification is an important enabling technology for a wide range of applications such as Internet search, email filtering, network intrusion detection, and data mining electronic documents in general. The k Nearest Neighbors (k-NN) text classification algorithm is among the most accurate classification approaches, but is also among the most computationally expensive. In this paper, we propose accelerating k-NN using a novel reconfigurable hardware based architecture. More specifically, we accelerate a k-NN application{\textquoteright}s core with an FPGA-based sparse matrix vector multiplication coprocessor. On average our implementation shows a speed up factor of 15 over a naive single threaded CPU implementation of k-NN text classification for our datasets, and a speed up factor of 1.5 over a 32-threaded parallelized CPU implementation.}, author = {Kevin Townsend and Song Sun and Tyler Johnson and Osama Attia and Phillip Jones and Joseph Zambreno} } @conference {TowZam15A, title = {A Multi-Phase Approach to Floating-Point Compression}, booktitle = {Proceedings of the IEEE International Conference on Electro/Information Technology (EIT)}, year = {2015}, month = {May}, abstract = {This paper presents a lossless double-precision floating point compression algorithm. Floating point compression can reduce the cost of storing and transmitting large amounts of data associated with big data problems. A previous algorithm called FPC performs well and uses predictors. However, predictors have limitations. Our program (fzip) overcomes some of these limitations. fzip has 2 phases, first BWT compression, second value and prefix compression with variable length arithmetic encoding. This approach has the advantage that the phases work together and each phase compresses a different type of pattern. On average fzip achieves a 20\% higher compression ratio than other algorithms.}, author = {Kevin Townsend and Joseph Zambreno} } @conference {AwaZhu15A, title = {Phase Aware Warp Scheduling: Mitigating Effects of Phase Behavior in GPGPU Applications}, booktitle = {Proceedings of the International Conference on Parallel Architectures and Compilation Techniques (PACT)}, year = {2015}, month = {October}, abstract = {Graphics Processing Units (GPUs) have been widely adopted as accelerators for high performance computing due to the immense amount of computational throughput they offer over their CPU counterparts. As GPU architectures are optimized for throughput, they execute a large number of SIMD threads (warps) in parallel and use hardware multithreading to hide the pipeline and memory access latencies. While the Two-Level Round Robin (TLRR) and Greedy Then Oldest (GTO) warp scheduling policies have been widely accepted in the academic research community, there is no consensus regarding which policy works best for all applications. In this paper, we show that the disparity regarding which scheduling policy works better depends on the characteristics of instructions in different regions (phases) of the application. We identify these phases at compile time and design a novel warp scheduling policy that uses information regarding them to make scheduling decisions at runtime. By mitigating the adverse effects of application phase behavior, our policy always performs closer to the better of the two existing policies for each application. We evaluate the performance of the warp schedulers on 35 kernels from the Rodinia and CUDA SDK benchmark suites. For applications that have a better performance with the GTO scheduler, our warp scheduler matches the performance of GTO with 99.2\% accuracy and achieves an average speedup of 6.31\% over RR. Similarly, for applications that perform better with RR, the performance of our scheduler is within of 98\% of RR and achieves an average speedup of 6.65\% over GTO.}, author = {Mihir Awatramani and Xian Zhu and Joseph Zambreno and Diane Rover} } @conference {RogUhi15A, title = {A Project-Based Embedded Systems Design Course Using a Reconfigurable SoC Platform}, booktitle = {Proceedings of the International Conference on Microelectronic Systems Education (MSE)}, year = {2015}, month = {May}, abstract = {Embedded systems are becoming increasingly complex, as typical system components, such as sensors and other specialized processors, are blended together with more traditional microprocessors to form complex systems-on-chips (SoCs). Teaching undergraduate students to understand concepts and technologies behind embedded systems is important in order to prepare these future engineers with the skills and expertise necessary for designing such complex systems. This paper describes an undergraduate course designed to introduce students to embedded system design concepts and challenges in an engaging and effective manner. Our course uses a combination of in-depth laboratory assignments and topical lectures to provide a unique hands-on education for students. Laboratory assignments utilize Avnet{\textquoteright}s ZedBoard platform, a development board built around Xilinx{\textquoteright}s Zynq-7000 SoC, and require students to solve a variety of embedded system challenges from a range of application domains. Overall, student feedback about the course has been positive.}, author = {Daniel Roggow and Paul Uhing and Phillip Jones and Joseph Zambreno} } @article {MonRog15A, title = {Real-time Simulation of Dynamic Vehicle Models using a High-performance Reconfigurable Platform}, journal = {Microprocessors and Microsystems}, volume = {39}, year = {2015}, pages = {720-740}, abstract = {With the increase in the complexity of models and lack of flexibility offered by the analog computers, coupled with the advancements in digital hardware, the simulation industry has subsequently moved to digital computers and increased usage of programming languages such as C, C++, and MATLAB. However, the reduced time-step required to simulate complex and fast systems imposes a tighter constraint on the time within which the computations have to be performed. The sequential execution of these computations fails to cope with the real-time constraints which further restrict the usefulness of Real-Time Simulation (RTS) in a Virtual Reality (VR) environment. In this paper, we present a methodology for the design and implementation of RTS algorithms, based on the use of Field-Programmable Gate Array (FPGA) technology. We apply our methodology to an 8th order steering valve subsystem of a vehicle with relatively low response time requirements and use the FPGA technology to improve the response time of this model. Our methodology utilizes traditional hardware/software co-design approaches to generate a heterogeneous architecture for an FPGA-based simulator by porting the computationally complex regions to hardware. The hardware design was optimized such that it efficiently utilizes the parallel nature of FPGAs and pipelines the independent operations. Further enhancement was made by building a hardware component library of custom accelerators for common non-linear functions. The library also stores the information about resource utilization, cycle count, and the relative error with different bit-width combinations for these components, which is further used to evaluate different partitioning approaches. In this paper, we illustrate the partitioning of a hardware-based simulator design across dual FPGAs, initiate RTS using a system input from a Hardware-in-the-Loop (HIL) framework, and use these simulation results from our FPGA-based platform to perform response analysis. The total simulation time, which includes the time required to receive the system input over a socket (without HIL), software initialization, hardware computation, and transfer of simulation results back over a socket, shows a speedup of 2x as compared to a similar setup with no hardware acceleration. The correctness of the simulation output from the hardware has also been validated with the simulated results from the software-only design.}, author = {Madhu Monga and Daniel Roggow and Manoj Karkee and Song Sun and Lakshmi Kiran Tondehal and Brian Steward and Atul Kelkar and Joseph Zambreno} } @article {AttTow15A, title = {A Reconfigurable Architecture for the Detection of Strongly Connected Components}, journal = {ACM Transactions on Reconfigurable Technology and Systems (TRETS)}, volume = {9}, year = {2015}, month = {December}, abstract = {The Strongly Connected Components (SCC) detection algorithm serves as a keystone for many graph analysis applications. The SCC execution time for large-scale graphs, as with many other graph algorithms, is dominated by memory latency. In this paper, we investigate the design of a parallel hardware architecture for the detection of SCCs in directed graphs. We propose a design methodology that alleviates memory latency and problems with irregular memory access. The design is composed of 16 processing elements dedicated to parallel Breadth-First Search (BFS) and 8 processing elements dedicated to finding intersection in parallel. Processing elements are organized to reuse resources and utilize memory bandwidth efficiently. We demonstrate a prototype of our design using the Convey HC-2 system, a commercial high-performance reconfigurable computing coprocessor. Our experimental results show a speedup of as much as 17x for detecting SCCs in large-scale graphs when compared to a conventional sequential software implementation.}, author = {Osama Attia and Kevin Townsend and Phillip Jones and Joseph Zambreno} } @article {TowAtt16A, title = {A Scalable Unsegmented Multi-port Memory for FPGA-based Systems}, journal = {International Journal of Reconfigurable Computing (IJRC)}, volume = {2015}, year = {2015}, month = {December}, abstract = {On-chip multi-port memory cores are crucial primitives for many modern high performance reconfigurable architectures and multi-core systems. Previous approaches for scaling memory cores come at the cost of operating frequency, communication overhead, and logic resources without increasing the storage capacity of the memory. In this paper, we present two approaches for designing multi-port memory cores that are suitable for reconfigurable accelerators with substantial on-chip memory or complex communication. Our design approaches tackle these challenges by banking RAM blocks and utilizing interconnect networks which allows scaling without sacrificing logic resources. With banking, memory congestion is unavoidable and we evaluate our multi-port memory cores under different memory access patterns to gain insights about different design trade-offs. We demonstrate our implementation with up to 256 memory ports using a Xilinx Virtex-7 FPGA. Our experimental results report high throughput memories with resource usage that scales with the number of ports. }, author = {Kevin Townsend and Osama Attia and Phillip Jones and Joseph Zambreno} } @conference {ZhaMil15A, title = {A Software Configurable and Parallelized Coprocessor Architecture for LQR Control}, booktitle = {Proceedings of the International Conference on ReConFigurable Computing and FPGAs (ReConFig)}, year = {2015}, month = {December}, abstract = {We present a software configurable and parallelized coprocessor architecture for LQR control that can control physical processes representable by a linear state-space model. Our proposed architecture has distinct advantages over purely software or purely hardware approaches. It differs from other hardware controllers in that it is not hardwired to control one or a small range of plant types (e.g. only electric motors). Via software, an embedded systems engineer can easily reconfigure the controller to suit a wide range of controls applications that can be represented as a state-space model. One goal of our approach is to support a design methodology to help bridge the gap between controls and embedded system software engineering. Control of the well-understood inverted pendulum on a cart is used as an illustrative example of how the proposed hardware accelerator architecture supports our envisioned design methodology for helping bridge this gap. Additionally, we explore the design space of our co-processor{\textquoteright}s parallel architecture in terms of computing speed and resource utilization. Our performance results show a 3.4 to 100 factor speedup over a 666 MHz embedded ARM processor, for plants that can be represented by 4 to 128 states respectively. }, author = {Pei Zhang and Aaron Mills and Joseph Zambreno and Phillip Jones} } @conference {MilZha15A, title = {A Software Configurable Coprocessor-Based State-Space Controller}, booktitle = {Proceedings of the International Symposium on Field-Programmable Logic and Applications (FPL)}, year = {2015}, month = {September}, abstract = {We present a software configurable coprocessor-based state-space controller that can control physical processes representable by a linear state-space model. Our proposed architecture has distinct advantages over purely software or purely hardware approaches. It differs from other hardware controllers in that it is not hardwired to control one or a small range of plant types (e.g. only electric motors). Via software, an embedded systems engineer can easily reconfigure the controller to suit a wide range of controls applications that can be represented as a state-space linear model. Additionally, we introduce a novel design methodology to help bridge the gap between controls and embedded system engineering. Control of the well-understood inverted pendulum on a cart is used as an illustrative example of how the proposed hardware accelerator architecture supports our envisioned design methodology for helping bridge the gap between controls and embedded software engineering. }, author = {Aaron Mills and Pei Zhang and Sudhanshu Vyas and Joseph Zambreno and Phillip Jones} } @conference {KumVya14A, title = {Cache Design for Mixed Critical Real-Time Systems}, booktitle = {Proceedings of the International Conference on Computer Design (ICCD)}, year = {2014}, month = {October}, abstract = {Shared caches in mixed criticality systems are a source of interference for safety critical tasks. Shared memory not only leads to worst-case execution time (WCET) pessimism, but also affects the response time of safety critical tasks. In this paper, we present a criticality aware cache design which implements a Least Critical (LC) cache replacement policy, where a least recently used non-critical cache line is replaced during a cache miss. The cache acts as a Least Recently Used (LRU) cache if there are no critical lines or if all cache lines are critical in a set. In our design, data within a certain address space is given higher preference in the cache. These critical address spaces are configured using critical address range (CAR) registers. The new cache design was implemented in a Leon3 processor core, a 32bit processor compliant with the SPARC V8 architecture. Experimental results are presented that illustrate the impact of the Least Critical cache replacement policy on the response time of critical tasks, and on overall application performance as compared to a conventional LRU cache policy.}, author = {Chetan Kumar and Sudhanshu Vyas and Ron Cytron and Christopher Gill and Joseph Zambreno and Phillip Jones} } @conference {AttJoh14A, title = {CyGraph: A Reconfigurable Architecture for Parallel Breadth-First Search}, booktitle = {Proceedings of the Reconfigurable Architectures Workshop (RAW)}, year = {2014}, month = {May}, abstract = {Large-scale graph structures are considered a keystone for many emerging high-performance computing applications in which Breadth-First Search (BFS) is an important building block. For such graph structures, BFS operations tends to be memory-bound rather than compute-bound. In this paper, we present an efficient reconfigurable architecture for parallel BFS that adopts new optimizations for utilizing memory bandwidth. Our architecture utilizes a custom graph representation based on compressed-sparse raw format (CSR), as well as a restructuring of the conventional BFS algorithm. By taking maximum advantage of available memory bandwidth, our architecture continuously keeps our processing elements active. Using a commercial high-performance reconfigurable computing system (the Convey HC-2), our results demonstrate a 5x speedup over previously published FPGA-based implementations. }, author = {Osama Attia and Tyler Johnson and Kevin Townsend and Phillip Jones and Joseph Zambreno} } @conference {WanZam14B, title = {An Efficient Architecture for Floating-Point Eigenvalue Decomposition}, booktitle = {Proceedings of the International Symposium on Field-Programmable Custom Computing Machines (FCCM)}, year = {2014}, month = {May}, abstract = {Eigenvalue decomposition (EVD) is a widely-used factorization tool to perform principal component analysis, and has been employed for dimensionality reduction and pattern recognition in many scientific and engineering applications, such as image processing, text mining and wireless communications. EVD is considered computationally expensive, and as software implementations have not been able to meet the performance requirements of many real-time applications, the use of reconfigurable computing technology has shown promise in accelerating this type of computation. In this paper, we present an efficient FPGA-based double-precision floating-point architecture for EVD, which can efficiently analyze large-scale matrices. Our experimental results using an FPGA-based hybrid acceleration system indicate the efficiency of our novel array architecture, with dimension-dependent speedups over an optimized software implementation that range from 1.5x to 15.45x in terms of computation time.}, author = {Xinying Wang and Joseph Zambreno} } @conference {WanZam14A, title = {An FPGA Implementation of the Hestenes-Jacobi Algorithm for Singular Value Decomposition}, booktitle = {Proceedings of the Reconfigurable Architectures Workshop (RAW)}, year = {2014}, month = {May}, abstract = {As a useful tool for dimensionality reduction, Singular Value Decomposition (SVD) plays an increasingly significant role in many scientific and engineering applications. The high computational complexity of SVD poses challenges for efficient signal processing and data analysis systems, especially for time-sensitive applications with large data sets. While the emergence of FPGAs provides a flexible and low-cost opportunity to pursue high-performance SVD designs, the classical two-sided Jacobi rotation-based SVD architectures are restricted in terms of scalability and input matrix representation. The Hestenes-Jacobi algorithm offers a more parallelizable solution to analyze arbitrary rectangular matrices; however, to date both FPGA and GPU-based implementations have not lived up to the algorithm{\textquoteright}s potential. In this paper, we introduce a floating-point Hestenes-Jacobi architecture for SVD, which is capable of analyzing arbitrary sized matrices. Our implementation on an FPGA-based hybrid acceleration system demonstrates improved efficiency of our architecture compared to an optimized software-based SVD solution for matrices with small to medium column dimensions, even with comparably large row dimensions. The dimensional speedups can be achieved range from 3.8x to 43.6x for matrices with column dimensions from 128 to 256 and row sizes from 128 to 2048. Additionally, we also evaluate the accuracy of our SVD process through convergence analysis.}, author = {Xinying Wang and Joseph Zambreno} } @article {VyaKum13A, title = {An FPGA-based Plant-on-Chip Platform for Cyber-Physical System Analysis}, journal = {IEEE Embedded Systems Letters (ESL)}, volume = {6}, number = {1}, year = {2014}, pages = {4-7}, abstract = {Digital control systems are traditionally designed independent of their implementation platform, assuming constant sensor sampling rates and processor response times. Applications are deployed to processors that are shared amongst control and non-control tasks, to maximize resource utilization. This potentially overlooks that computing mechanisms meant for improving average CPU usage, such as cache, interrupts, and task management through schedulers, contribute to non-deterministic interference between tasks. This response time jitter can result in reduced system stability, motivating further study by both the controls and computing communities to maximize CPU utilization, while maintaining physical system stability needs. In this paper, we describe an FPGA-based embedded software platform coupled with a hardware plant emulator (as opposed to purely software-based simulations or hardware-in-the-loop setups) that forms a basis for safe and accurate analysis of Cyber-Physical Systems. We model and analyze an inverted pendulum to demonstrate that our setup can provide a significantly more accurate representation of a real system.}, author = {Sudhanshu Vyas and Chetan Kumar and Joseph Zambreno and Christopher Gill and Ron Cytron and Phillip Jones} } @article {PanChe13A, title = {Hardware Architecture for Video Authentication using Sensor Pattern Noise}, journal = {IEEE Transactions on Circuits and Systems for Video Technology (TCSVT)}, volume = {24}, number = {1}, year = {2014}, pages = {157-167}, abstract = {Digital camera identification can be accomplished based on sensor pattern noise which is unique to a device and serves as a distinct identification fingerprint. Camera identification and authentication has formed the basis of image / video forensics in legal proceedings. Unfortunately, real-time video source identification is a computationally heavy task, and does not scale well to conventional software implementations on typical embedded devices. In this paper, we propose a hardware architecture for source identification in networked cameras. The underlying algorithms, an orthogonal forward and inverse Discrete Wavelet Transform (DWT) and Minimum Mean Square Error (MMSE) based Estimation have been optimized for 2D frame sequences in terms of area and throughput performance. We exploit parallelism, pipelining and hardware reuse techniques to minimize hardware resource utilization and increase the achievable throughput of the design. A prototype implementation on a Xilinx Virtex-6 FPGA device was optimized with a resulting throughput of 167 MBps, processing 30 640 {\texttimes} 480 video frames in 0.17 second.}, author = {Amit Pande and Shaxun Chen and Prasant Mohapatra and Joseph Zambreno} } @article {KumVya13B, title = {Hardware-Software Architecture for Priority Queue Management in Real-time and Embedded Systems}, journal = {International Journal of Embedded Systems (IJES)}, volume = {6}, number = {4}, year = {2014}, pages = {319-334}, abstract = {The use of hardware-based data structures for accelerating real-time and embedded system applications is limited by the scarceness of hardware resources. By their nature, being limited by the silicon area available, hardware data structures cannot scale in size as easily as their software counterparts. We assert a hardware-software co-design approach is required to elegantly overcome these limitations. In this paper, we present a hybrid priority queue architecture that includes a hardware accelerated binary heap that can also be managed in software when its queue size exceeds hardware limits. A memory mapped interface provides software with access to priority-queue-structured on-chip memory, which enables quick and low overhead transitions between hardware and software management. As an application of this hybrid architecture, we present a scalable task scheduler for real-time systems that reduces scheduler processing overhead and improves timing determinism of the scheduler. }, author = {Chetan Kumar and Sudhanshu Vyas and Ron Cytron and Christopher Gill and Joseph Zambreno and Phillip Jones} } @conference {TowJon14A, title = {A High Performance Systolic Architecture for k-NN Classification}, booktitle = {Proceedings of the International Conference on Formal Methods and Models for Codesign (MEMOCODE)}, year = {2014}, month = {October}, abstract = {This paper describes the architecture of the winning entry to the 2014 Memocode Design Contest, in the maximum performance category. This year{\textquoteright}s Memocode design contest asks contestants to find the 10 nearest neighbors between 1,000 testing points and 10,000,000 training points. Instead of using Euclidean distance, the contest uses Mahalanobis distance. The contest has 2 awards: the maximum performance award and the cost adjusted performance award. Our implementation uses a brute force approach that calculates the distance between every testing point to every training point. We use the Convey HC-2ex, a FPGA-based platform. However, the theory applies to software implementations as well. At the time of publication, our runtime is 0.54 seconds.}, author = {Kevin Townsend and Phillip Jones and Joseph Zambreno} } @conference {AwaRov14A, title = {Perf-Sat: Runtime Detection of Performance Saturation for GPGPU Applications}, booktitle = {Proceedings of the International Workshop on Scheduling and Resource Management for Parallel and Distributed Systems (SRMPDS)}, year = {2014}, month = {September}, abstract = {Graphic Processing Units (GPUs) achieve latency tolerance by exploiting massive amounts of thread level parallelism. Each core executes several hundred to a few thousand simultaneously active threads. The work scheduler tries to maximize the number of active threads on each core by launching threads until at least one of the required resources is completely utilized. The rationale is, more threads would give the thread scheduler more opportunities to hide memory latency and thus would result in better performance. In this work, we show that launching the maximum number of threads is not always necessary to achieve the best performance. Applications have an optimal thread count value at which the performance saturates. Increasing the number of threads beyond this value results in no better and sometimes worse performance. To this end, we develop Perf-Sat: a mechanism to detect the optimal number of threads required on each core at runtime. Perf-Sat is integrated into the hardware work scheduler and guides it to either increase or decrease the number of active threads. We evaluate the performance impact of our scheduler on two GPU generations and show that Perf-Sat scales well to different applications as well as architectures. With performance loss of less than 1\%, Perf-Sat is able to achieve core resource savings of 18.32\% on average.}, author = {Mihir Awatramani and Diane Rover and Joseph Zambreno} } @conference {WanZam14C, title = {A Reconfigurable Architecture for QR Decomposition Using A Hybrid Approach}, booktitle = {Proceedings of the IEEE Computer Society Annual Symposium on VLSI (ISVLSI)}, year = {2014}, month = {July}, abstract = {QR decomposition has been widely used in many signal processing applications to solve linear inverse problems. However, QR decomposition is considered a computationally expensive process, and its sequential implementations fail to meet the requirements of many time-sensitive applications. The Householder transformation and the Givens rotation are the most popular techniques to conduct QR decomposition. Each of these approaches have their own strengths and weakness. The Householder transformation lends itself to efficient sequential implementation, however its inherent data dependencies complicate parallelization. On the other hand, the structure of Givens rotation provides many opportunities for concurrency, but is typically limited by the availability of computing resources. We propose a deeply pipelined reconfigurable architecture that can be dynamically configured to perform either approach in a manner that takes advantage of the strengths of each. At runtime, the input matrix is first partitioned into numerous submatrices. Our architecture then performs parallel Householder transformations on the sub-matrices in the same column block, which is followed by parallel Givens rotations to annihilate the remaining unneeded individual off-diagonals. Analysis of our design indicates the potential to achieve a performance of 10.5 GFLOPS with speedups of up to 1.46x, 1.15x and 13.75x compared to the MKL implementation, a recent FPGA design and a Matlab solution, respectively.}, author = {Xinying Wang and Phillip Jones and Joseph Zambreno} } @conference {MilZam14A, title = {Towards Scalable Monitoring and Maintenance of Rechargeable Batteries}, booktitle = {Proceedings of the IEEE International Conference on Electro/Information Technology (EIT)}, year = {2014}, month = {June}, abstract = {Current research on State-of-Charge (SOC) tracking for rechargeable batteries focuses primarily on analyzing batteries consisting of a single cell, or otherwise treat a set of series-connected cells as a homogeneous unit. Yet, as the number of series-connected cells per battery increase, so does the challenge of ensuring safe and efficient operation over a potentially long period of deployment. Cell-level energy balancing is commonly proposed as a means to address the effects of cell property mismatch. However, no comprehensive solution exists addressing the need to maintain SOC accuracy over the full life of a large battery, while also managing the energy imbalance which develops between cells. If poorly managed, this imbalance can reduce usable capacity and lifespan. This paper proposes an integrated solution to these various issues by tracking SOC on a per-cell basis and applying SOC to a cell-balancing application. The effectiveness is demonstrated using a custom test platform.}, author = {Aaron Mills and Joseph Zambreno} } @article {PanZam11D, title = {A Chaotic Encryption Scheme for Real-time Embedded Systems: Design and Implementation}, journal = {Telecommunication Systems}, volume = {52}, number = {2}, year = {2013}, pages = {551-561}, abstract = {Chaotic encryption schemes are believed to provide greater level of security than conventional ciphers. In this paper, a chaotic stream cipher is first constructed and then its hardware implementation details over Xilinx Virtex-6 FPGA are provided. Logistic map is the simplest chaotic system and has high potential to be used to design a stream cipher for real-time embedded systems. Its simple construct and non-linear dynamics makes it a common choice for such applications. In this paper, we present a Modified Logistic Map (MLM) which improves the performance of Logistic Map in terms of higher Lyapunov exponent and uniformity of bifurcation map. It also avoids the stable orbits of logistic map giving a more chaotic behavior to the system. A stream cipher is built using MLM and random feedback scheme. The proposed cipher gives 16 bits of encrypted data per clock cycle. The hardware implementation results over Xilinx Virtex-6 FPGA give a synthesis clock frequency of 93 MHz and a throughput of 1.5 Gbps while using 16 hardware multipliers. This makes the cipher suitable for embedded devices which have tight constraints on power consumption, hardware resources and real-time parameters.}, author = {Amit Pande and Joseph Zambreno} } @book {PanZam13A, title = {Embedded Multimedia Security Systems}, year = {2013}, pages = {146}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, address = {London}, author = {Amit Pande and Joseph Zambreno} } @article {VyaGup13A, title = {Hardware Architectural Support for Control Systems and Sensor Processing}, journal = {ACM Transactions on Embedded Computing Systems (TECS)}, volume = {13}, number = {2}, year = {2013}, abstract = {The field of modern control theory and the systems used to implement these controls have shown rapid development over the last 50 years. It was often the case that those developing control algorithms could assume the computing medium was solely dedicated to the task of controlling a plant. For example, the control algorithm being implemented in software on a dedicated digital signal processor (DSP), or implemented in hardware using a simple dedicated programmable logic device (PLD). As time progressed, the drive to place more system functionality in a single component (reducing power, cost, and increasing reliability) has made this assumption less often true. Thus, it has been pointed out by some experts in the field of control theory (e.g. Astrom) that those developing control algorithms must take into account the effects of running their algorithms on systems that will be shared with other tasks. One aspect of the work presented is this article is a hardware architecture that allows control developers to maintain this simplifying assumption. We focus specifically on the proportional-integral-derivative (PID) controller. An on-chip coprocessor has been implemented that can scale to support servicing hundreds of plants, while maintaining microsecond level response times, tight deterministic control loop timing, and allows the main processor to service non-control tasks. In order to control a plant, the controller needs information about the plant{\textquoteright}s state. Typically this information is obtained from sensors with which the plant has been instrumented. There are a number of common computations that may be performed on this sensor data before being presented to the controller (e.g. averaging and thresholding). Thus in addition to supporting PID algorithms, we have developed a sensor processing unit (SPU) that off-loads these common sensor processing tasks from the main processor. We have prototyped our ideas using Field Programmable Gate Array (FPGA) technology. Through our experimental results, we show our PID execution unit gives orders of magnitude improvement in response time when servicing many plants, as compared to a standard general software implementation. We also show that the SPU scales much better than a general software implementation. In addition, these execution units allow the simplifying assumption of dedicated computing medium to hold for control algorithm development.}, author = {Sudhanshu Vyas and Adwait Gupte and Christopher Gill and Ron Cytron and Joseph Zambreno and Phillip Jones} } @conference {MihZam13A, title = {Increasing GPU Throughput using Kernel Interleaved Thread Block Scheduling}, booktitle = {Proceedings of the International Conference on Computer Design (ICCD)}, year = {2013}, month = {October}, abstract = {The number of active threads required to achieve peak application throughput on graphics processing units (GPUs) depends largely on the ratio of time spent on computation to the time spent accessing data from memory. While compute-intensive applications can achieve peak throughput with a low number of threads, memory-intensive applications might not achieve good throughput even at the maximum supported thread count. In this paper, we study the effects of scheduling work from multiple applications on the same GPU core. We claim that interleaving workload from different applications on a GPU core can improve the utilization of computational units and reduce the load on memory subsystem. Experiments on 17 application pairs from the Rodinia benchmark suite show that overall throughput increases by 7\% on average.}, author = {Mihir Awatramani and Joseph Zambreno and Diane Rover} } @conference {PatMil13A, title = {A Multi-Faceted Approach to FPGA-Based Trojan Circuit Detection}, booktitle = {Proceedings of the IEEE VLSI Test Symposium (VTS)}, year = {2013}, month = {April}, abstract = {Three general approaches to detecting Trojans embedded in FPGA circuits were explored in the context of the 2012 CSAW Embededed Systems Challenge: functional testing, power analysis, and direct analysis of the bitfile. These tests were used to classify a set of 32 bitfiles which include Trojans of an unknown nature. The project is a step towards developing a framework for Trojan-detection which leverages the strengths of a variety of testing techniques.}, author = {Michael Patterson and Aaron Mills and Ryan Scheel and Julie Tillman and Evan Dye and Joseph Zambreno} } @conference {KriZam13A, title = {Polarity Trend Analysis of Public Sentiment on YouTube}, booktitle = {Proceedings of the International Conference on Management of Data (COMAD)}, year = {2013}, month = {December}, abstract = {For the past several years YouTube has been by far the largest user-driven online video provider. While many of these videos contain a significant number of user comments, little work has been done to date in extracting trends from these comments because of their low information consistency and quality. In this paper we perform sentiment analysis of the YouTube comments related to popular topics using machine learning techniques. We demonstrate that an analysis of the sentiments to identify their trends, seasonality and forecasts can provide a clear picture of the influence of real-world events on user sentiments.}, author = {Amar Krishna and Joseph Zambreno and Sandeep Krishnan} } @conference {TowZam13A, title = {Reduce, Reuse, Recycle (R^3): a Design Methodology for Sparse Matrix Vector Multiplication on Reconfigurable Platforms}, booktitle = {Proceedings of the International Conference on Application-specific Systems, Architectures and Processors (ASAP)}, year = {2013}, month = {June}, abstract = {Sparse Matrix Vector Multiplication (SpMV) is an important computational kernel in many scientific computing applications. Pipelining multiply-accumulate operations shifts SpMV from a computationally bounded kernel to an I/O bounded kernel. In this paper, we propose a design methodology and hardware architecture for SpMV that seeks to utilize system memory bandwidth as efficiently as possible, by Reducing the matrix element storage with on-chip decompression hardware, Reusing the vector data by mixing row and column matrix traversal, and Recycling data with matrix-dependent on-chip storage. Our experimental results with a Convey HC-1/HC-2 reconfigurable computing system indicate that for certain sparse matrices, our R^3 methodology performs twice as fast as previous reconfigurable implementations, and effectively competes against other platforms.}, author = {Kevin Townsend and Joseph Zambreno} } @conference {KumVya13A, title = {Scheduling Challenges in Mixed Critical Real-time Heterogeneous Computing Platforms}, booktitle = {Proceedings of Dynamic Data Driven Application Systems (DDDAS)}, year = {2013}, month = {June}, abstract = {In Dynamic Data-Driven Application Systems (DDDAS), applications must dynamically adapt their behavior in response to objectives and conditions that change while deployed. Often these applications may be safety critical or tightly resource constrained, with a need for graceful degradation when introduced to unexpected conditions. This paper begins by motivating and providing a vision for a dynamically adaptable mixed critical computing platform to support DDDAS applications. We then specifically focus on the need for advancements in task models and scheduling algorithms to manage the resources of such a platform. We discuss the short comings of existing task models for capturing important attributes of our envisioned computing platform, and identify challenges that must be addressed when developing scheduling algorithms that act upon our proposed extended task model.}, author = {Chetan Kumar and Sudhanshu Vyas and Ron Cytron and Christopher Gill and Joseph Zambreno and Phillip Jones} } @article {PanMoh12A, title = {Securing Multimedia Content using Joint Compression and Encryption}, journal = {IEEE MultiMedia}, volume = {20}, number = {4}, year = {2013}, pages = {50-61}, abstract = {Algorithmic parameterization and hardware architectures can ensure secure transmission of multimedia data in resource-constrained environments such as wireless video surveillance networks, tele-medicine frameworks for distant health care support in rural areas, and Internet video streaming. Joint multimedia compression and encryption techniques can significantly reduce the computational requirements of video processing systems. We present an approach to reduce the computational cost of multimedia encryption, while also preserving the properties of compressed video (useful for scalability, transcoding, and retrieval), which endanger loss by naive encryption. Hardware-amenable design of proposed algorithms makes them suitable for real-time embedded multimedia systems. This approach alleviates the need of additional hardware for encryption in resource constrained scenario, and can be otherwise used to augment existing encryption methods used for content delivery in Internet or other applications. In this work, we show how two compression blocks for video coding: a modified frequency transform (called as Secure Wavelet Transform or SWT) and a modified entropy coding scheme, (called Chaotic Arithmetic Coding (CAC)) can be used for video encryption. Experimental results are shown for selective encryption using proposed schemes. }, author = {Amit Pande and Prasant Mohapatra and Joseph Zambreno} } @article {PanZam12B, title = {Comments on {\textquoteright}Arithmetic Coding as a Non-Linear Dynamical System{\textquoteright}}, journal = {Communications in Nonlinear Science and Numerical Simulation (CNSNS)}, volume = {17}, number = {12}, year = {2012}, pages = {4536-4543}, abstract = {Nagaraj et al. [1, 2] present a skewed-non-linear Generalized Luroth Series (s-nGLS) framework. S-nGLS uses non-linear maps for GLS to introduce a security parameter a which is used to build a keyspace for image or data encryption. The map introduces non-linearity to the system to add an "encryption key parameter". The skew is added to achieve optimal compression efficiency. s-nGLS used as such for joint encryption and compression is a weak candidate, as explained in this communication. First, we show how the framework is vulnerable to known plaintext based attacks and that a key of size 256 bits can be broken within 1000 trials. Next, we demonstrate that the proposed non-linearity exponentially increases the hardware complexity of design. We also discover that s-nGlS can{\textquoteright}t be implemented as such for large bitstreams. Finally, we demonstrate how correlation of key parameter with compression performance leads to further key vulnerabilities.}, author = {Amit Pande and Joseph Zambreno and Prasant Mohapatra} } @conference {MilVya12A, title = {Design and Evaluation of a Delay-Based FPGA Physically Unclonable Function}, booktitle = {Proceedings of the International Conference on Computer Design (ICCD)}, year = {2012}, month = {September}, abstract = {A new Physically Unclonable Function (PUF) variant was developed on an FPGA, and its quality evaluated. It is conceptually similar to PUFs developed using standard SRAM cells, except it utilizes general FPGA reconfigurable fabric, which offers several advantages. Comparison between our approach and other PUF designs indicates that our design is competitive in terms of repeatability within a given instance, and uniqueness between instances. The design can also be tuned to achieve desired response characteristics which broadens the potential range of applications.}, author = {Aaron Mills and Sudhanshu Vyas and Michael Patterson and Christopher Sabotta and Phillip Jones and Joseph Zambreno} } @conference {SteZam12A, title = {Exposing High School Students to Concurrent Programming Principles using Video Game Scripting Engines}, booktitle = {Proceedings of the Annual Conference of the American Society for Engineering Education (ASEE)}, year = {2012}, month = {June}, abstract = {Introducing programming using an imperative language often requires a steep learning curve due to the significant emphasis and corresponding time commitment placed on a particular language{\textquoteright}s syntax and semantics. This paper presents a suite of video game scripting engines focusing on nurturing computational skills that can be explored in as little as one hour. Scripting engines run code developed by students to control four concurrent players on a team; up to four teams (four different code scripts) can play in a head-to-head competition. To achieve a quick learning curve, the scripting engine only supports a limited number of instructions to define initial player qualities, movements, and game actions. Students are faced with the computational thinking challenge of mapping their game strategies into code. Successful strategies require teams to appreciate the complexities of concurrent programming to control all game players simultaneously. We have observed that students quickly learn that writing code for all team players individually does not result in a competitive match, but requires a mixture of collaboration and parallel programming to be competitive in a short amount of time. The need for more advanced control flow semantics are also motivated, since students must rewrite similar code for performing similar routines through the game simulation. The video game scripting engines have been used in two high school outreach programs and results from these events indicate that the learning objectives were met and students were engaged in the activities the entire duration by modifying their code to be more competitive. Lessons learned from the first scripting engine (Dodgeball) that went into creating the second engine ( Boomtown) are also presented. }, author = {Michael Steffen and Joseph Zambreno} } @conference {KumVya12A, title = {Improving System Predictability and Performance via Hardware Accelerated Data Structures}, booktitle = {Proceedings of Dynamic Data Driven Application Systems (DDDAS)}, year = {2012}, month = {June}, abstract = {In Dynamic Data-Driven Application Systems, applications must dynamically adapt their behavior in response to objectives and conditions that change while deployed. One approach to achieve dynamic adaptation is to offer middleware that facilitates component migration between modalities in response to such dynamic changes. The triggering, planning, and cost evaluation of adaptation takes place within a scheduler. Scheduling overhead is a major limiting factor for implementing dynamic scheduling algorithms with high frequency timer-tick resolution in real time systems. In this paper, we present a scalable hardware scheduler architecture for real time systems that reduces processing overhead and improves timing predictability of the scheduler. A new hardware priority queue design is presented, which supports insertions in constant time, and removals in O(log n) time. The hardware scheduler supports three (Rate Monotonic Scheduling (RMS), Earliest Deadline First (EDF), priority based) scheduling algorithms, which can be configured during run-time. The interface to the scheduler is provided through a set of custom instructions as an extension to the processors instruction set architecture. We also report on our experience migrating between two implementations of an ordered-set implementation, with the goal of providing predictable performance for real-time applications.}, author = {Chetan Kumar and Sudhanshu Vyas and Jonathan Shidal and Ron Cytron and Christopher Gill and Joseph Zambreno and Phillip Jones} } @conference {SteJon12A, title = {Introducing Graphics Processing from a Systems Perspective: A Hardware / Software Approach}, booktitle = {Proceedings of the Annual Conference of the American Society for Engineering Education (ASEE)}, year = {2012}, month = {June}, abstract = {Typical courses in computer graphics focus mainly on the core graphics rendering algorithms and software interfaces - hardware and system-level issues are addressed, if at all, through classroom lectures of industrial case studies. We have recently introduced a senior technical elective which introduces graphics processing from the perspective of the software developer, hardware architect, and system integrator. Towards this end, lecture topics are designed for students with no computer graphics background, and focus on solving specific computing problems using skills learned from a variety of computer engineering courses (e.g. digital logic, computer architecture, software design, embedded systems). As part of the laboratory component, students are presented with a series of bi-weekly design challenges that are geared towards implementing a particular module in the 3D graphics pipeline (with both hardware and software support) using an FPGA-based hardware prototyping platform. Although the main focus of the labs is on architectural design, hardware implementation, and hardware / software verification; each assignment also involves both a functional correctness as well as an optional performance optimization component. Only by analyzing the interactions between the graphics application, middleware, architecture, and logic levels can the performance optimization goal be achieved. Each subsequent challenge builds upon those previous, such that by the end of the semester students will have designed and implemented a fully-functional OpenGL-compliant graphics processor, capable of running significant applications. The course was introduced in the Spring of 2011 and the results from the final course project indicated that many of our intended learning objectives were met; student feedback was also positive. }, author = {Michael Steffen and Phillip Jones and Joseph Zambreno} } @article {SunMon11A, title = {An I/O Bandwidth-Sensitive Sparse Matrix-Vector Multiplication Engine on FPGAs}, journal = {IEEE Transactions on Circuits and Systems-I (TCAS-I)}, volume = {59}, number = {1}, year = {2012}, pages = {113-123}, abstract = {Sparse Matrix-Vector Multiplication (SMVM) is a fundamental core of many high-performance computing applications, including information retrieval, medical imaging, and economic modeling. While the use of reconfigurable computing technology in a high-performance computing environment has shown recent promise in accelerating a wide variety of scientific applications, existing SMVM architectures on FPGA hardware have been limited in that they require either numerous pipeline stalls during computation (due to zero padding) or excessive input preprocessing during run-time. For large-scale sparse matrix scenarios, both of these shortcomings can result in unacceptable performance overheads, limiting the overall value of using FPGAs in a high-performance computing environment. In this paper, we present a scalable and efficient FPGA-based SMVM architecture which can handle arbitrary matrix sparsity patterns without excessive preprocessing or zero padding and can be dynamically expanded based on the available I/O bandwidth. Our experimental results using a commercial FPGA-based acceleration system demonstrate that our reconfigurable SMVM engine is highly efficient, with benchmark-dependent speedups over an optimized software implementation that range from 3.5x to 6.5x in terms of computation time.}, author = {Song Sun and Madhu Monga and Phillip Jones and Joseph Zambreno} } @article {PanZam12A, title = {Poly-DWT: Polymorphic Wavelet Hardware Support For Dynamic Image Compression}, journal = {ACM Transactions on Embedded Computing Systems (TECS)}, volume = {11}, number = {1}, year = {2012}, abstract = {Many modern computing applications have been enabled through the use of real-time multimedia processing. While several hardware architectures have been proposed in the research literature to support such primitives, these fail to address applications whose performance and resource requirements have a dynamic aspect. Embedded multimedia systems typically need a power and computation efficient design in addition to good compression performance. In this paper, we introduce a Polymorphic Wavelet Architecture (Poly-DWT) as a crucial building block towards the development of embedded systems to address such challenges. We illustrate how our Poly-DWT architecture can potentially make dynamic resource allocation decisions, such as the internal bit representation and the processing kernel, according to the application requirements. We introduce a filter switching architecture that allows for dynamic switching between 5/3 and 9/7 wavelet filters and leads to a more power efficient design. Further, a multiplier-free design with a low adder requirement demonstrates the potential of Poly-DWT for embedded systems. Through an FPGA prototype, we perform a quantitative analysis of our Poly-DWT architecture, and compare our filter to existing approaches to illustrate the area and performance benefits inherent in our approach. }, author = {Amit Pande and Joseph Zambreno} } @conference {MonKar12A, title = {Real-Time Simulation of Dynamic Vehicle Models using a High-performance Reconfigurable Platform}, booktitle = {Proceedings of the International Conference on Computational Science (ICCS)}, year = {2012}, month = {June}, abstract = {A purely software-based approach for Real-Time Simulation (RTS) may have difficulties in meeting real-time constraints for complex physical model simulations. In this paper, we present a methodology for the design and implementation of RTS algorithms, based on the use of Field-Programmable Gate Array (FPGA) technology to improve the response time of these models. Our methodology utilizes traditional hardware/software co-design approaches to generate a heterogeneous architecture for an FPGA-based simulator. The hardware design was optimized such that it efficiently utilizes the parallel nature of FPGAs and pipelines the independent operations. Further enhancement is obtained through the use of custom accelerators for common non-linear functions. Since the systems we examined had relatively low response time requirements, our approach greatly simplifies the software components by porting the computationally complex regions to hardware. We illustrate the partitioning of a hardware-based simulator design across dual FPGAs, initiate RTS using a system input from a Hardware-in-the-Loop (HIL) framework, and use these simulation results from our FPGA-based platform to perform response analysis. The total simulation time, which includes the time required to receive the system input over a socket (without HIL), software initialization, hardware computation, and transfer of simulation results back over a socket, shows a speedup of 2x as compared to a similar setup with no hardware acceleration. The correctness of the simulation output from the hardware has also been validated with the simulated results from the software-only design.}, author = {Madhu Monga and Manoj Karkee and Lakshmi Kiran Tondehal and Brian Steward and Atul Kelkar and Joseph Zambreno} } @conference {NelTow12A, title = {Shepard: A Fast Exact Match Short Read Aligner}, booktitle = {Proceedings of the International Conference on Formal Methods and Models for Codesign (MEMOCODE)}, year = {2012}, month = {July}, abstract = {The mapping of many short sequences of DNA, called reads, to a long reference genome is an common task in molecular biology. The task amounts to a simple string search, allowing for a few mismatches due to mutations and inexact read quality. While existing solutions attempt to align a high percentage of the reads using small memory footprints, Shepard is concerned with only exact matches and speed. Using the human genome, Shepard is on the order of hundreds of thousands of times faster than current software implementations such as SOAP2 or Bowtie, and about 60 times faster than GPU implementations such as SOAP3. Shepard contains two components: a software program to preprocess a reference genome into a hash table, and a hardware pipeline for performing fast lookups. The hash table has one entry for each unique 100 base pair sequence that occurs in the reference genome, and contains the index of last occurrence and the number of occurrences. To reduce the hash table size, a minimal perfect hash table is used. The hardware pipeline was designed to perform hash table lookups very quickly, on the order of 600 million lookups per second, and was implemented on a Convey HC-1 high performance reconfigurable computing system. Shepard streams all of the short reads through a custom hardware pipeline and writes the alignment data (index of last occurrence and number of occurrences) to a binary results array.}, author = {Chad Nelson and Kevin Townsend and Bhavani Satyanarayana Rao and Phillip Jones and Joseph Zambreno} } @conference {PanZam11B, title = {Architectures for Simultaneous Coding and Encryption using Chaotic Maps}, booktitle = {Proceedings of the IEEE Computer Society Annual Symposium on VLSI (ISVLSI)}, year = {2011}, month = {July}, abstract = {In this work, we discuss an interpretation of arithmetic coding using chaotic maps. We present a hardware implementation using 64 bit fixed point arithmetic on Virtex-6 FPGA (with and without using DSP slices). The encoder resources are slightly higher than a traditional AC encoder, but there are savings in decoder performance. The architectures achieve clock frequency of 400-500 MHz on Virtex-6 xc6vlx75 device.}, author = {Amit Pande and Joseph Zambreno and Prasant Mohapatra} } @article {BauSte10A, title = {A Case Study in Hardware Trojan Design and Implementation}, journal = {International Journal of Information Security (IJIS)}, volume = {10}, number = {1}, year = {2011}, pages = {1-14}, abstract = {As integrated circuits (ICs) continue to have an overwhelming presence in our digital information-dominated world, having trust in their manufacture and distribution mechanisms is crucial. However, with ever-shrinking transistor technologies, the cost of new fabrication facilities is becoming prohibitive, pushing industry to make greater use of potentially less reliable foreign sources for their IC supply. The 2008 Computer Security Awareness Week (CSAW) Embedded Systems Challenge at the Polytechnic Institute of NYU highlighted some of the vulnerabilities of the IC supply chain in the form of a hardware hacking challenge. This paper explores the design and implementation of our winning entry.}, author = {Alex Baumgarten and Michael Steffen and Matthew Clausman and Joseph Zambreno} } @conference {SayJon11A, title = {Characterizing Non-Ideal Impacts of Reconfigurable Hardware Workloads on Ring Oscillator-based Thermometers}, booktitle = {Proceedings of the International Conference on Reconfigurable Computing and FPGAs (Reconfig)}, year = {2011}, month = {November}, abstract = {Thermal issues have resulted in growing concerns among industries fabricating various types of devices, such as Chip Multiprocessors (CMP) and reconfigurable hardware devices. Since passive cooling costs have risen considerably and packaging for worst-case is no longer practical, dynamic thermal management techniques are being devised to combat thermal effects. For such techniques to be applied effectively, it is necessary to accurately measure device temperatures at run time. Although several techniques have been proposed to measure the on-chip temperatures of reconfigurable devices, ring oscillators in many ways are a preferred choice due to their strong linear temperature-dependence and compact design using available spare reconfigurable resources. A major problem in using ring-oscillators to measure temperature, however, is their strong dependence on the core voltage of, and current distribution throughout the device under test. One of the reasons for variations in these properties is changes in the workload running on the device. Researchers have seen large shifts in the output frequencies of ring-oscillators due to core voltage swings on reconfigurable devices, and have tried to find alternate ways of measuring temperature that attempt to mitigate these effects. The need, however, is to have a workload-compensated ring oscillator-based thermometer for reconfigurable devices. To obtain this, it is first necessary to characterize the non-ideal effects of workload variations on ring oscillator response. Where non-ideal refers to impacts on ring oscillator oscillation frequency due to phenomena other than the workload{\textquoteright}s impact on device temperature. This paper performs such a characterization, in which the effects of workload variation on ring oscillator output frequency is quantified. A complete hardware-software setup is designed to collect temperature and power related data along with ring oscillator response to varying workload configurations. In addition, a potential issue with using the Xilinx System Monitor to measure die temperature at high ranges is also briefly discussed.}, author = {Moinuddin Sayed and Phillip Jones} } @conference {RilGra11A, title = {Circumventing a Ring Oscillator Approach to FPGA-Based Hardware Trojan Detection}, booktitle = {Proceedings of the International Conference on Computer Design (ICCD)}, year = {2011}, month = {October}, abstract = {Ring oscillators are commonly used as a locking mechanism that binds a hardware design to a specific area of silicon within an integrated circuit (IC). This locking mechanism can be used to detect malicious modifications to the hardware design, also known as a hardware Trojan, in situations where such modifications result in a change to the physical placement of the design on the IC. However, careful consideration is needed when designing ring oscillators for such a scenario to guarantee the integrity of the locking mechanism. This paper presents a case study in which flaws discovered in a ring oscillator-based Trojan detection scheme allowed for the circumvention of the security mechanism and the implementation of a large and diverse set of hardware Trojans, limited only by hardware resources.}, author = {Justin Rilling and David Graziano and Jamin Hitchcock and Timothy Meyer and Xinying Wang and Phillip Jones and Joseph Zambreno} } @article {SunZam11A, title = {Design and Analysis of a Reconfigurable Platform for Frequent Pattern Mining}, journal = {IEEE Transactions on Parallel and Distributed Systems (TPDS)}, volume = {22}, number = {9}, year = {2011}, month = {September}, pages = {1497-1505}, abstract = {Frequent pattern mining algorithms are designed to find commonly occurring sets in databases. This class of algorithms is typically very memory intensive, leading to prohibitive runtimes on large databases. A class of reconfigurable architectures has been recently developed that have shown promise in accelerating some data mining applications. In this paper, we propose a new architecture for frequent pattern mining based on a systolic tree structure. The goal of this architecture is to mimic the internal memory layout of the original pattern mining software algorithm while achieving a higher throughput. We provide a detailed analysis of the area and performance requirements of our systolic tree-based architecture, and show that our reconfigurable platform is faster than the original software algorithm for mining long frequent patterns.}, author = {Song Sun and Joseph Zambreno} } @article {PanZam11A, title = {Efficient Mapping and Acceleration of AES on Custom Multi-Core Architectures}, journal = {Concurrency and Computation: Practice and Experience}, volume = {23}, number = {4}, year = {2011}, pages = {372-389}, abstract = {Multi-core processors can deliver significant performance benefits for multi-threaded software by adding processing power with minimal latency, given the proximity of the processors. Cryptographic applications are inherently complex and involve large computations. Most cryptographic operations can be translated into logical operations, shift operations, and table look-ups. In this paper we design a novel processor (called -core) with a reconfigurable Arithmetic Logic Unit, and design custom two-dimensional multicore architectures on top of it to accelerate cryptographic kernels. We propose an efficient mapping of instructions from the multi-core grid to the individual processor cores and illustrate the performance of AES-128E algorithm over custom-sized grids. The model was developed using Simulink and the performance analysis suggests a positive trend towards development of large multi-core (or multi--core) architectures to achieve high throughputs in cryptographic operations.}, author = {Amit Pande and Joseph Zambreno} } @conference {PanZam11C, title = {Hardware Architecture for Simultaneous Arithmetic Coding and Encryption}, booktitle = {Proceedings of the International Conference on Engineering of Reconfigurable Systems and Algorithms (ERSA)}, year = {2011}, month = {July}, abstract = {Arithmetic coding is increasingly being used in upcoming image and video compression standards such as JPEG2000, and MPEG-4/H.264 AVC and SVC standards. It provides an efficient way of lossless compression and recently, it has been used for joint compression and encryption of video data. In this paper, we present an interpretation of arithmetic coding using chaotic maps. This interpretation greatly reduces the hardware complexity of decoder to use a single multiplier by using an alternative algorithm and enables encryption of video data at negligible computational cost. The encoding still requires two multiplications. Next, we present a hardware implementation using 64 bit fixed point arithmetic on Virtex-6 FPGA (with and without using DSP slices). The encoder resources are slightly higher than a traditional AC encoder, but there are savings in decoder performance. The architectures achieve clock frequency of 400-500 MHz on Virtex-6 xc6vlx75 device. We also advocate multiple symbol AC encoder design to increase throughput/slice of the device, obtaining a value of 4.}, author = {Amit Pande and Joseph Zambreno and Prasant Mohapatra} } @conference {SteJon11A, title = {Teaching Graphics Processing and Architecture using a Hardware Prototyping Approach}, booktitle = {Proceedings of the International Conference on Microelectronic Systems Education (MSE)}, year = {2011}, month = {June}, abstract = {Since its introduction over two decades ago, graphics hardware has continued to evolve to improve rendering performance and increase programmability. While most undergraduate courses in computer graphics focus on rendering algorithms and programming APIs, we have recently created an undergraduate senior elective course that focuses on graphics processing and architecture, with a strong emphasis on laboratory work targeting hardware prototyping of the 3D rendering pipeline. In this paper, we present the overall course layout and FPGA-based laboratory infrastructure, that by the end of the semester enables students to implement an OpenGL-compliant graphics processor. To our knowledge, this class is the first that takes a hardware prototyping approach to teaching computer graphics and architecture.}, author = {Michael Steffen and Phillip Jones and Joseph Zambreno} } @conference {PanMoh11A, title = {Using Chaotic Maps for Encrypting Image and Video Content}, booktitle = {Proceedings of the International Symposium on Multimedia (ISM)}, year = {2011}, month = {December}, abstract = {Arithmetic Coding (AC) is widely used for the entropy coding of text and multimedia data. It involves recursive partitioning of the range [0,1) in accordance with the relative probabilities of occurrence of the input symbols. In this paper, we present a data (image or video) encryption scheme based on arithmetic coding, which we refer to as Chaotic Arithmetic Coding (CAC). In CAC, a large number of chaotic maps can be used to perform coding, each achieving Shannon optimal compression performance. The exact choice of map is governed by a key. CAC has the effect of scrambling the intervals without making any changes to the width of interval in which the codeword must lie, thereby allowing encryption without sacrificing any coding efficiency. We next describe Binary CAC (BCAC) with some simple Security Enhancement (SE) modes which can alleviate the security of scheme against known cryptanalysis against AC-based encryption techniques. These modes, namely Plaintext Modulation (PM), Pair-Wise Independent Keys (PWIK), and Key and ciphertext Mixing (MIX) modes have insignificant computational overhead, while BCAC decoder has lower hardware requirements than BAC coder itself, making BCAC with SE as excellent choice for deployment in secure embedded multimedia systems. A bit sensitivity analysis for key and plaintext is presented along with experimental tests for compression performance.}, author = {Amit Pande and Prasant Mohapatra and Joseph Zambreno} } @conference {PanZam10B, title = {Design and Hardware Implementation of a Chaotic Encryption Scheme for Real-Time Embedded Systems}, booktitle = {Proceedings of the IEEE Signal Processing and Communications Conference (SPCOM)}, year = {2010}, month = {July}, abstract = {Chaotic encryption schemes are believed to provide a greater level of security than conventional ciphers. In this paper, a chaotic stream cipher is first constructed and then its hardware implementation details using FPGA technology are provided. Logistic map is the simplest chaotic system and has a high potential to be used to design a stream cipher for real-time embedded systems. The cipher uses a pseudo-random sequence generator based on modified logistic map (MLM) and a random feedback scheme. MLM has better chaotic properties than the logistic map in terms of uniformity of bifurcation diagram and also avoids the stable orbits of logistic map, giving a more chaotic behavior to the system. The proposed cipher gives 16 bits of encrypted data per clock cycle. The hardware implementation results over Xilinx Virtex-6 FPGA give a synthesis clock frequency of 93 MHz and a throughput of 1.5 Gbps while using 16 hardware multipliers. This makes the cipher suitable for embedded devices which have tight constraints on power consumption, hardware resources and real-time parameters.}, author = {Amit Pande and Joseph Zambreno} } @conference {CheSun10A, title = {Dynamic Simulation of DFIG Wind Turbines on FPGA Boards}, booktitle = {Proceedings of the Power and Energy Conference at Illinois (PECI)}, year = {2010}, month = {February}, pages = {39-44}, abstract = {This paper presents the implementation of a dynamic simulation of a doubly fed induction generator (DFIG)- based wind turbine on a field-programmable gate array (FPGA) board. The explicit fourth-order Runge-Kutta numerical integration algorithm is used to obtain the system dynamic response. The FPGA simulation results and speed improvement are validated versus a Matlab/Simulink simulation. Using FPGAs as computational engines can lead to significant simulation speed gains when compared to a typical PC computer, especially when operations can be efficiently parallelized on the board.}, author = {Hao Chen and Song Sun and Dionysios Aliprantis and Joseph Zambreno} } @conference {GupJon10A, title = {An Evaluation of a Slice Fault Aware Tool Chain}, booktitle = {Proceedings of Design, Automation, and Test in Europe (DATE)}, year = {2010}, month = {March}, abstract = {As FPGA sizes and densities grow, their manufacturing yields decrease. This work looks toward reclaiming some of this lost yield. Several previous works have suggested fault aware CAD tools for intelligently routing around faults. In this work we evaluate such an approach quantitatively with respect to some standard benchmarks. We also quantify the trade-offs between performance and fault tolerance in such a method. Leveraging existing CAD tools, we show up to 30\% of slices being faulty can be tolerated. Such approaches could potentially allow manufacturers to sell larger chips with manufacturing faults as smaller chips using a nomenclature that appropriately captures the reduction in logic resources.}, author = {Adwait Gupte and Phillip Jones} } @conference {SteZam10A, title = {A Hardware Pipeline for Accelerating Ray Traversal Algorithms on Streaming Processors}, booktitle = {Proceedings of the IEEE Symposium on Application Specific Processors (SASP)}, year = {2010}, month = {June}, abstract = {Ray Tracing is a graphics rendering method that uses rays to trace the path of light in a computer model. To accelerate the processing of rays, scenes are typically compiled into smaller spatial boxes using a tree structure and rays then traverse the tree structure to determine relevant spatial boxes. This allows computations involving rays and scene objects to be limited to only objects close to the ray and does not require processing all elements in the computer model. We present a ray traversal pipeline designed to accelerate ray tracing traversal algorithms using a combination of currently used programmable graphics processors and a new fixed hardware pipeline. Our fixed hardware pipeline performs an initial traversal operation that quickly identifies a smaller sized, fixed granularity spatial bounding box from the original scene. This spatial box can then be traversed further to identify subsequently smaller spatial bounding boxes using any user-defined acceleration algorithm. We show that our pipeline allows for an expected level of user programmability, including development of custom data structures, and can support a wide range of processor architectures. The performance of our pipeline is evaluated for ray traversal and intersection stages using a kd-tree ray tracing algorithm and a custom simulator modeling a generic streaming processor architecture. Experimental results show that our pipeline reduces the number of executed instructions on a graphics processor for the traversal operation by 2.15X for visible rays. The memory bandwidth required for traversal is also reduced by a factor of 1.3X for visible rays.}, author = {Michael Steffen and Joseph Zambreno} } @conference {SteZam10B, title = {Improving SIMT Efficiency of Global Rendering Algorithms with Architectural Support for Dynamic Micro-Kernels}, booktitle = {Proceedings of the International Symposium on Microarchitecture (MICRO)}, year = {2010}, month = {December}, pages = {237-248}, abstract = {Wide Single Instruction, Multiple Thread (SIMT) architectures often require a static allocation of thread groups that are executed in lockstep throughout the entire application kernel. Individual thread branching is supported by executing all control flow paths for threads in a thread group and only committing the results of threads on the current control path. While convergence algorithms are used to maximize processor efficiency during branching operations, applications requiring complex control flow often result in low processor efficiency due to the length and quantity of control paths. Global rendering algorithms are an example of a class of application that can be accelerated using a large number of independent parallel threads that each require complex control flow, resulting in comparatively low efficiency on SIMT processors. To improve processor utilization for global rendering algorithms, we introduce a SIMT architecture that allows for threads to be created dynamically at runtime. Large application kernels are broken down into smaller code blocks we call {\textmu}-kernels that dynamically created threads can execute. These runtime {\textmu}-kernels allow for the removal of branching statements that would cause divergence within a thread group, and result in new threads being created and grouped with threads beginning execution of the same {\textmu}-kernel. In our evaluation of SIMT processor efficiency for a global rendering algorithms, dynamic {\textmu}-kernels improved processor performance by an average of 1.4x.}, author = {Michael Steffen and Joseph Zambreno} } @conference {PanZam10D, title = {Joint Video Compression and Encryption using Arithmetic Coding and Chaos}, booktitle = {Proceedings of the IEEE International Conference on Internet Multimedia Systems Architecture and Applications (IMSAA)}, year = {2010}, month = {December}, abstract = {Joint Video Compression and Encryption (JVCE) has gained increased attention in the past couple of years to reduce the computational complexity of video compression, as well as provide encryption of multimedia content for web services. In this paper, we present a JVCE framework based on Binary Arithmetic Coding (BAC). We first present an interpretation of BAC in terms of a skewed binary map and then describe 7 other possible chaotic maps which give similar Shannon optimal performance as BAC. We then propose a modification of BAC in which the overall length within the range [0,1) allocated to each symbol is preserved, but the choice of map used to encode each symbol is based on a key. The encoder, referred to as Chaotic Binary Arithmetic Coder (CBAC), has the effect of scrambling the intervals without making any changes to the width of interval in which the codeword must lie, thereby allowing encryption without sacrificing any coding efficiency. We also present some some security enhancement features to show how they can alleviate the limitations of our technique against known cryptanalysis on BAC-based encryption schemes.}, author = {Amit Pande and Joseph Zambreno and Prasant Mohapatra} } @article {BauTya10A, title = {Preventing Integrated Circuit Piracy using Reconfigurable Logic Barriers}, journal = {IEEE Design and Test of Computers}, volume = {27}, number = {1}, year = {2010}, month = {January}, pages = {66-75}, abstract = {Hardware metering to prevent IC piracy is a challenging and important problem. The authors propose a combinational locking scheme based on intelligent placement of the barriers throughout the design in which the objective is to maximize the effectiveness of the barriers and to minimize the overhead.}, author = {Alex Baumgarten and Akhilesh Tyagi and Joseph Zambreno} } @conference {KarMon10A, title = {Real-Time Simulation and Visualization Architecture with Field Programmable Gate Array (FPGA) Simulator}, booktitle = {Proceedings of the ASME World Conference on Innovative Virtual Reality (WINVR)}, year = {2010}, month = {May}, abstract = {Real-time simulation of dynamic vehicle system models is essential to facilitate advances in operator and hardware in the loop simulation and virtual prototyping. Real-time virtual reality-based simulation enables users to visualize and perceive the effect of their actions during the simulation. As model complexity is increased to improve the model fidelity, the computational requirements will also increase, thus increasing the challenge to meet real-time constraints. A distributed simulator architecture was developed for off-road vehicle dynamic models and 3D graphics visualization to distribute the overall computational load across multiple computational platforms. This architecture consisted of three major components: a dynamic model simulator, a virtual reality simulator, and an interface to controller and input hardware devices. The dynamic model simulator component was developed using Matlab/Simulink Real Time Workshop on a PC and also using Field Programmable Gate Arrays (FPGA), which offered a highly parallel hardware platform. The simulator architecture reduced the computational load to an individual platform and increased the real-time simulation capability with complex off-road vehicle system models and controllers. The architecture was used to develop, simulate and visualize a tractor and towed implement steering dynamics model. The model also included a steering valve sub-system which contained very high frequency hydraulic dynamics and required 10 μs integration time step for numerical stability. The real-time simulation goal was not achievable for the model with this level of complexity when the PC-based simulator was used. However, the FPGA-based simulator achieved a real-time goal taking only 2 μs to complete one integration time step. }, author = {Manoj Karkee and Madhu Monga and Brian Steward and Joseph Zambreno and Atul Kelkar} } @conference {PanZam10A, title = {A Reconfigurable Architecture for Secure Multimedia Delivery}, booktitle = {Proceedings of the International Conference on VLSI Design (VLSID)}, year = {2010}, month = {January}, abstract = {This paper introduces a reconfigurable architecture for ensuring secure and real-time video delivery through a novel parameterized construction of the Discrete Wavelet Transform (DWT). This parameterized construction promises multimedia encryption and is also well-suited to a hardware implementation due to our derivation of rational filter coefficients. We achieve an efficient and high-throughput reconfigurable hardware implementation through the use of LUT-based constant multipliers enabling run-time reconfiguration of encryption key. We also compare our prototype (using a Xilinx Virtex 4 FPGA) to several existing implementations in the research literature and show that we achieve superior performance as compared to both traditional CPU-based and custom VLSI approaches while adding features for secure multimedia delivery.}, author = {Amit Pande and Joseph Zambreno} } @article {PanZam10E, title = {Reconfigurable Hardware Implementation of a Modified Chaotic Filter Bank Scheme}, journal = {International Journal of Embedded Systems (IJES)}, volume = {10}, number = {3}, year = {2010}, pages = {248--258}, abstract = {Chaotic filter bank schemes have been proposed in the research literature to allow for the efficient encryption of data for real-time embedded systems. Some security flaws have been found in the underlying approaches which makes such a scheme unsafe for application in real life scenarios. In this paper, we first present an improved scheme to alleviate the weaknesses of the chaotic filter bank scheme, and add enhanced security features, to form a modified chaotic filter bank (MCFB) scheme. Next, we present a reconfigurable hardware implementation of the MCFB scheme. Implementation on reconfigurable hardware speeds up the performance of MCFB scheme by mapping some of the multipliers in design to reconfigurable look-up tables, while removing many unnecessary multipliers. An optimised implementation on Xilinx Virtex-5 XC5VLX330 FPGA gave a speedup of 30\% over non-optimised direct implementation. A clock frequency of 88 MHz was obtained. }, author = {Amit Pande and Joseph Zambreno} } @article {PanZam10C, title = {The Secure Wavelet Transform}, journal = {Springer Journal of Real-Time Image Processing (RTIP)}, year = {2010}, abstract = {There has been an increasing concern for the security of multimedia transactions over real-time embedded systems. Partial and selective encryption schemes have been proposed in the research literature, but these schemes significantly increase the computation cost leading to tradeoffs in system latency, throughput, hardware requirements and power usage. In this paper, we propose a lightweight multimedia encryption strategy based on a modified discrete wavelet transform (DWT) which we refer to as the secure wavelet transform (SWT). The SWT provides joint multimedia encryption and compression by two modifications over the traditional DWT implementations: (a) parameterized construction of the DWT and (b) subband re-orientation for the wavelet decomposition. The SWT has rational coefficients which allow us to build a high throughput hardware implementation on fixed point arithmetic. We obtain a zero-overhead implementation on custom hardware. Furthermore, a Look-up table based reconfigurable implementation allows us to allocate the encryption key to the hardware at run-time. Direct implementation on Xilinx Virtex FPGA gave a clock frequency of 60 MHz while a reconfigurable multiplier based design gave a improved clock frequency of 114 MHz. The pipelined implementation of the SWT achieved a clock frequency of 240 MHz on a Xilinx Virtex-4 FPGA and met the timing constraint of 500 MHz on a standard cell realization using 45 nm CMOS technology.}, author = {Amit Pande and Joseph Zambreno} } @conference {SatBau09A, title = {Architectural Support for Automated Software Attack Detection, Recovery, and Prevention}, booktitle = {Proceedings of the International Conference on Embedded and Ubiquitous Computing (EUC)}, year = {2009}, month = {August}, abstract = {Attacks on software systems are an increasingly serious problem from an economic and security standpoint. Many techniques have been proposed ranging from simple compiler modifications to full-scale re-engineering of computer systems architecture aimed at attack detection. Traditional techniques ignore the arguably more important problem of graceful recovery. Without recovery, even a successful attack detection can become an effective Denial-of-Service. We propose an architectural approach to attack detection and recovery called rollback and huddle that monitors a program{\textquoteright}s execution with a lightweight attack-detection module while continuously checkpointing the system state. In the case of an attack, the program state is rolled back to a time before the attack occurred and an additional module is loaded to identify the source of the attack, repair the original vulnerability, and prevent future attacks. The simple hardware modules work alongside a standard computer architecture and aid in attack detection, checkpoint creation, and attack recovery. Experimental results show minimal runtime overhead and resource utilization.}, author = {Jesse Sathre and Alex Baumgarten and Joseph Zambreno} } @article {SunYan09A, title = {Demonstrable Differential Power Analysis Attacks on Real-World FPGA-Based Embedded Systems}, journal = {Integrated Computer-Aided Engineering}, volume = {16}, number = {2}, year = {2009}, month = {April}, pages = {119-130}, abstract = {In the decade since the concept was publicly introduced, power analysis attacks on cryptographic systems have become an increasingly studied topic in the computer security community. Research into countermeasures for these cryptographic systems has intensified as well. Experiments have been conducted showing the potential effectiveness of power analysis attacks and preventative techniques on both software (e.g. smartcard, DSP) and hardware (e.g. ASIC, FPGA) processing elements. One key observation that motivates our work is that the majority of the research into power analysis on FPGA-based cryptographic systems has been a) theoretical in nature, b) evaluated through simulation, or c) experimented using custom hardware that does not closely mirror real-world systems. In this paper, we look to bridge this gap between theory and practice by detailing our experience in performing a Differential Power Analysis (DPA) attack on a commercial FPGA development board. We present an automated data acquisition and analysis design for an FPGA-based implementation of the Data Encryption Standard (DES), and discuss some of the challenges and obstacles that we encountered when performing the DPA attack on our chosen commercial platform.}, author = {Song Sun and Jackey Yan and Joseph Zambreno} } @conference {SteZam09A, title = {Design and Evaluation of a Hardware Accelerated Ray Tracing Data Structure}, booktitle = {Proceedings of Theory and Practice of Computer Graphics (TPCG)}, year = {2009}, month = {June}, abstract = {The increase in graphics card performance and processor core count has allowed significant performance acceleration for ray tracing applications. Future graphics architectures are expected to continue increasing the number of processor cores, further improving performance by exploiting data parallelism. However, current ray tracing implementations are based on recursive searches which involve multiple memory reads. Consequently, software implementations are used without any dedicated hardware acceleration. In this paper, we introduce a ray tracing method designed around hierarchical space subdivision schemes that reduces memory operations. In addition, parts of this traversal method can be performed in fixed hardware running in parallel with programmable graphics processors. We used a custom performance simulator that uses our traversal method, based on a kd-tree, to compare against a conventional kd-tree. The system memory requirements and system memory reads are analyzed in detail for both acceleration structures. We simulated six benchmark scenes and show a reduction in the number of memory reads of up to 70 percent compared to current recursive methods for scenes with over 100,000 polygons.}, author = {Michael Steffen and Joseph Zambreno} } @conference {CheSun09A, title = {Dynamic Simulation of Electric Machines on FPGA Boards}, booktitle = {Proceedings of the International Electric Machines and Drives Conference (IEMDC)}, year = {2009}, month = {May}, abstract = {This paper presents the implementation of an induction machine dynamic simulation on a field-programmable gate array (FPGA) board. Using FPGAs as computational engines can lead to significant simulation speed gains when compared to a typical PC computer, especially when operations can be efficiently parallelized on the board. The textbook example of a free acceleration followed by a step load change is used to outline the basic steps of designing an explicit Runge{\textendash}Kutta numerical ordinary differential equation (ODE) solver on the FPGA platform. The FPGA simulation results and speed improvement are validated versus a Matlab/Simulink simulation.}, author = {Hao Chen and Song Sun and Dionysios Aliprantis and Joseph Zambreno} } @conference {PanZam09B, title = {An Efficient Hardware Architecture for Multimedia Encryption and Authentication using the Discrete Wavelet Transform}, booktitle = {Proceedings of the IEEE Computer Society Annual Symposium on VLSI (ISVLSI)}, year = {2009}, month = {May}, pages = {85-90}, abstract = {This paper introduces a zero-overhead encryption and authentication scheme for real-time embedded multimedia systems. The parameterized construction of the Discrete Wavelet Transform (DWT) compression block is used to introduce a free parameter in the design. It allows building a keyspace for lightweight multimedia encryption. The parameterization yields rational coefficients leading to an efficient fixed point hardware implementation. A clock speed of over 240 MHz was achieved on a Xilinx Virtex 5 FPGA. Comparison with existing approaches was performed to indicate the high throughput and low hardware overhead in adding the security feature to the DWT architecture.}, author = {Amit Pande and Joseph Zambreno} } @conference {PanZam09C, title = {Efficient Translation of Algorithmic Kernels on Large-Scale Multi-Cores}, booktitle = {Proceedings of the International Workshop on Reconfigurable and Multicore Embedded Systems (WoRMES)}, year = {2009}, month = {August}, abstract = {In this paper we present the design of a novel embedded processor architecture (which we call a {\textmu}-core) that makes use of a reconfigurable ALU. This core serves as the basis of custom 2-dimensional array architectures that can be used to accelerate algorithms such as cryptography and image processing. An efficient translation and mapping of instructions from the multi-core grid to the individual processor cores is proposed and illustrated with an implementation of the AES encryption algorithm on custom-sized grids. A simulation model was developed using Simulink and the performance analysis suggests a positive trend towards the development and utilization of such hardware.}, author = {Amit Pande and Joseph Zambreno} } @conference {SunZam09A, title = {A Floating-point Accumulator for FPGA-based High Performance Computing Applications}, booktitle = {Proceedings of the International Conference on Field-Programmable Technology (FPT)}, year = {2009}, month = {December}, abstract = {A floating-point accumulator for FPGA-based high performance computing applications is proposed and evaluated. Compared to previous work, our accumulator uses a fixed size circuit, and can reduce an arbitrary number of input sets of varying sizes without requiring prior knowledge of the bounds of summands. In this paper, we describe how the adder accumulator operator can be heavily pipelined to achieve a high clock speed when mapped to FPGA technology, while still maintaining the original input ordering. Our experimental results show that our accumulator design is very competitive with previous efforts in terms of FPGA resource usage and clock frequency, making it an ideal building block for large-scale sparse matrix computations as implemented in FPGA-based high performance computing systems.}, author = {Song Sun and Joseph Zambreno} } @conference {GupJon09B, title = {Hotspot Mitigation using Dynamic Partial Reconfiguration for Improved Performance}, booktitle = {Proceedings of the International Conference on Reconfigurable Computing and FPGAs (Reconfig)}, year = {2009}, month = {December}, abstract = {As the chips get denser and faster, heat dissipation is fast turning into a major problem in development of ICs. Nonuniform heating of chips due to hotspots is also an area of concern and much research. In this paper, we propose an adaptive method which takes advantage of the self-reconfiguration capability of modern FPGAs to mitigate hotspots. We adapt the floor plan of the IC in response to the current use and ambient conditions on the fly. It is most applicable to paradigms such as Network on Chip (NoC) that allow separation of communication and computation and allow communication between modules to be abstracted away. We achieve a reduction of up to 8{\textopenbullet} C in the maximum temperature of a hotspot using typical power numbers. Alternatively, by increasing the frequency, we achieve a 2-3 times increase in throughput while maintaining the same maximum temperature.}, author = {Adwait Gupte and Phillip Jones} } @conference {GupJon09A, title = {Towards Hardware Support for Common Sensor Processing Tasks}, booktitle = {Proceedings of the International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA)}, year = {2009}, month = {August}, abstract = {Sensor processing is a common task within many embedded system domains, such as in control systems sensor feedback used for actuator control, etc. In this paper we have surveyed several embedded system domains, and extracted kernels of computation that are common across applications within a given domain, or across domains. We have shown that adding architectural support for executing these common kernels of computation can yield an overall better system performance. We present a light weight, simplified prototype of a Sensor Processing Unit (SPU) that offloads these computations from the main Arithmetic Logic Unit (ALU) of an embedded processor, and that accesses sensor data in a low latency manner. Our SPU prototype shows an average speed up factor of 2.48 over executing these kernels on an embedded PowerPC processor. A large portion of this speed up is due to our low latency method for accessing sensor data. Isolating our speed up to purely computation still shows an average speed up factor of 1.38 for these kernels.}, author = {Adwait Gupte and Phillip Jones} } @article {SatZam08A, title = {Automated Software Attack Recovery using Rollback and Huddle}, journal = {Springer Journal of Design Automation for Embedded Systems (DAES)}, volume = {12}, number = {3}, year = {2008}, month = {September}, pages = {243-260}, abstract = {While research into building robust and survivable networks has steadily intensified in recent years, similar efforts at the application level and below have focused primarily on attack discovery, ignoring the larger issue of how to gracefully recover from an intrusion at that level. Our work attempts to bridge this inherent gap between theory and practice through the introduction of a new architectural technique, which we call rollback and huddle. Inspired by concepts made popular in the world of software debug, we propose the inclusion of extra on-chip hardware for the efficient storage and tracing of execution contexts. Upon the detection of some software protection violation, the application is restarted at the last known safe checkpoint (the rollback part). During this deterministic replay, an additional hw/sw module is then loaded that can increase the level of system monitoring, log more detailed information about any future attack source, and potentially institute a live patch of the vulnerable part of the software executable (the huddle part). Our experimental results show that this approach could have a practical impact on modern computing system architectures, by allowing for the inclusion of low-overhead software security features while at the same time incorporating an ability to gracefully recover from attack.}, author = {Jesse Sathre and Joseph Zambreno} } @conference {PanZam08A, title = {Design and Analysis of Efficient Reconfigurable Wavelet Filters}, booktitle = {Proceedings of the IEEE International Conference on Electro/Information Technology (EIT)}, year = {2008}, month = {May}, abstract = {Real-time image and multimedia processing applications such as video surveillance and telemedicine can have dynamic requirements of system latency, throughput, and power consumption. In this paper we discuss the design of reconfigurable wavelet filters for image processing applications that can meet such dynamic requirements. We generate several efficient hardware designs based on a derived family of bi-orthogonal 9/7 filters. An efficient folded and multiplier-free implementation of a 9/7 filter is obtained with the help of nine adders, which is a significant improvement over other existing approaches. We also propose an architecture that allows for on-the-fly switching between 9/7 and 5/3 filter structures. A performance comparison of these filters and their hardware requirements with other existing approaches demonstrates the suitability of our choice.}, author = {Amit Pande and Joseph Zambreno} } @conference {SunYan08A, title = {Experiments in Attacking FPGA-Based Embedded Systems using Differential Power Analysis}, booktitle = {Proceedings of the IEEE International Conference on Electro/Information Technology (EIT)}, year = {2008}, month = {May}, abstract = {In the decade since the concept was publicly introduced, power analysis attacks on cryptographic systems have become an increasingly studied topic in the computer security community. Research into countermeasures for these cryptographic systems has intensified as well. Experiments have been conducted showing the potential effectiveness of power analysis attacks and preventative techniques on both software (e.g. smartcard, DSP) and hardware (e.g. ASIC, FPGA) processing elements. One key observation that motivates our work is that the majority of the research into power analysis on FPGA-based cryptographic systems has been a) theoretical in nature, b) evaluated through simulation, or c) experimented using custom hardware that does not closely mirror real-world systems. In this paper, we look to bridge this gap between theory and practice by detailing our experience in performing a Differential Power Analysis (DPA) attack on a commercial FPGA development board. We present an automated data acquisition and analysis design for an FPGA-based implementation of the Data Encryption Standard (DES), and discuss some of the challenges and obstacles that we encountered when performing the DPA attack on our chosen commercial platform.}, author = {Song Sun and Jackey Yan and Joseph Zambreno} } @conference {SunZam08A, title = {Mining Association Rules with Systolic Trees}, booktitle = {Proceedings of the International Conference on Field-Programmable Logic and its Applications (FPL)}, year = {2008}, month = {September}, pages = {143-148}, abstract = {Association Rules Mining (ARM) algorithms are designed to find sets of frequently occurring items in large databases. ARM applications have found their way into a variety of fields, including medicine, biotechnology, and marketing. This class of algorithm is typically very memory intensive, leading to prohibitive runtimes on large databases. Previous attempts at acceleration using custom or reconfigurable hardware have been limited, as many of the significant ARM algorithms were designed from a software developer{\textquoteright}s perspective and have features (e.g. dynamic linked lists, recursion) that do not translate well to hardware. In this paper we look at how we can accomplish the goal of association rules mining from a hardware perspective. We investigate a popular tree-based ARM algorithm (FP-growth), and make use of a systolic tree structure, which mimics the internal memory layout of the original software algorithm while achieving much higher throughput. Our experimental prototype demonstrates how we can trade memory resources on a software platform for computational resources on a reconfigurable hardware platform, in order to exploit a fine-grained parallelism that was not inherent in the original ARM algorithm.}, author = {Song Sun and Joseph Zambreno} } @conference {PanZam08B, title = {Polymorphic Wavelet Architectures using Reconfigurable Hardware}, booktitle = {Proceedings of the International Conference on Field-Programmable Logic and its Applications (FPL)}, year = {2008}, month = {September}, pages = {471-474}, abstract = {Traditional microprocessor-based solutions are insufficient to serve the dynamic throughput demands of real-time scalable multimedia processing systems. This paper introduces a Polymorphic Architecture for the Discrete Wavelet Transform (Poly-DWT) as a building block of reconfigurable systems to address these needs. We illustrate how our PolyDWT architecture can dynamically make resource allocation decisions according to application requirements. We perform a quantitative analysis of our Poly-DWT architecture using an FPGA prototype, and compare our filters to existing approaches to illustrate the area and performance benefits inherent in our approach.}, author = {Amit Pande and Joseph Zambreno} } @conference {SunSte08A, title = {A Reconfigurable Platform for Frequent Pattern Mining}, booktitle = {Proceedings of the International Conference on ReConFigurable Computing and FPGAs (ReConFig)}, year = {2008}, month = {December}, abstract = {In this paper, a new hardware architecture for frequent pattern mining based on a systolic tree structure is proposed. The goal of this architecture is to mimic the internal memory layout of the original FP-growth algorithm while achieving a much higher throughput. We also describe an embedded platform implementation of this architecture along with detailed analysis of area requirements and performance results for different configurations. Our results show that with an appropriate selection of tree size, the reconfigurable platform can be several orders of magnitude faster than the FP-growth algorithm}, author = {Song Sun and Michael Steffen and Joseph Zambreno} } @conference {SatZam07A, title = {Rollback and Huddle: Architectural Support for Trustworthy Application Replay}, booktitle = {Proceedings of the Workshop on Embedded Software Security (WESS)}, year = {2007}, month = {October}, abstract = {While research into building robust and survivable networks has steadily intensified in recent years, similar efforts at the application level and below have focused primarily on attack discovery, ignoring the larger issue of how to gracefully recover from an intrusion at that level. Our work attempts to bridge this inherent gap between theory and practice through the introduction of a new architectural technique, which we call rollback and huddle. Inspired by concepts made popular in the world of software debug, we propose the inclusion of extra on-chip hardware for the efficient storage and tracing of execution contexts. Upon the detection of some software protection violation, the application is restarted at the last known safe checkpoint (the rollback part). During this deterministic replay, an additional hw/sw module is then loaded that can increase the level of system monitoring, log more detailed information about any future attack source, and potentially institute a live patch of the vulnerable part of the software executable (the huddle part). Our experimental results show that this approach could have a practical impact on modern computing system architectures, by allowing for the inclusion of low-overhead software security features while at the same time incorporating an ability to gracefully recover from attack.}, author = {Jesse Sathre and Joseph Zambreno} }