% Journal article, ACM TODAES 20(2), 2015.
% NOTE(review): no pages/doi recorded in this entry -- add when known.
@article{GupVya15A,
	author = {Gupte, Adwait and Vyas, Sudhanshu and Jones, Phillip},
	title = {A Fault-aware Toolchain Approach for {FPGA} Fault Tolerance},
	journal = {ACM Transactions on Design Automation of Electronic Systems (TODAES)},
	volume = {20},
	number = {2},
	year = {2015},
	abstract = {As the size and density of silicon chips continue to increase, maintaining acceptable manufacturing yields has become increasingly difficult. Recent works suggest that lithography techniques are reaching their limits with respect to enabling high yield fabrication of small-scale devices, thus there is an increasing need for techniques that can tolerate fabrication time defects. One candidate technology to help combat these defects is reconfigurable hardware. The flexible nature of reconfigurable devices, such as Field Programmable Gate Arrays (FPGAs), makes it possible for them to route around defective areas of a chip after the device has been packaged and deployed into the field.

This work presents a technique that aims to increase the effective yield of FPGA manufacturing by re-claiming a portion of chips that would be ordinarily classified as unusable. In brief, we propose a modification to existing commercial toolchain flows to make them fault aware. A phase is added to identify faults within the chip. The locations of these faults are then used by the toolchain to avoid faults during the placement and routing phase.

Specifically, we have applied our approach to the Xilinx commercial toolchain flow and evaluated its tolerance to both logic and routing resource faults. Our findings show that, at a cost of 5-10\% in device frequency performance, the modified toolchain flow can tolerate up to 30\% of logic resources being faulty and, depending on the nature of the target application, can tolerate 1-30\% of the device's routing resources being faulty. These results provide strong evidence that commercial toolchains not designed for the purpose of tolerating faults can still be greatly leveraged in the presence of faults to place and route circuits in an efficient manner.}
}
% Journal article, ACM TECS 13(2), 2013.
% NOTE(review): no pages/doi recorded in this entry -- add when known.
@article{VyaGup13A,
	author = {Vyas, Sudhanshu and Gupte, Adwait and Gill, Christopher and Cytron, Ron and Zambreno, Joseph and Jones, Phillip},
	title = {Hardware Architectural Support for Control Systems and Sensor Processing},
	journal = {ACM Transactions on Embedded Computing Systems (TECS)},
	volume = {13},
	number = {2},
	year = {2013},
	abstract = {The field of modern control theory and the systems used to implement these controls have shown rapid development over the last 50 years. It was often the case that those developing control algorithms could assume the computing medium was solely dedicated to the task of controlling a plant. For example, the control algorithm being implemented in software on a dedicated digital signal processor (DSP), or implemented in hardware using a simple dedicated programmable logic device (PLD). As time progressed, the drive to place more system functionality in a single component (reducing power, cost, and increasing reliability) has made this assumption less often true. Thus, it has been pointed out by some experts in the field of control theory (e.g. Astrom) that those developing control algorithms must take into account the effects of running their algorithms on systems that will be shared with other tasks. One aspect of the work presented in this article is a hardware architecture that allows control developers to maintain this simplifying assumption. We focus specifically on the proportional-integral-derivative (PID) controller. An on-chip coprocessor has been implemented that can scale to support servicing hundreds of plants, while maintaining microsecond level response times, tight deterministic control loop timing, and allows the main processor to service non-control tasks.

In order to control a plant, the controller needs information about the plant's state. Typically this information is obtained from sensors with which the plant has been instrumented. There are a number of common computations that may be performed on this sensor data before being presented to the controller (e.g. averaging and thresholding). Thus in addition to supporting PID algorithms, we have developed a sensor processing unit (SPU) that off-loads these common sensor processing tasks from the main processor.

We have prototyped our ideas using Field Programmable Gate Array (FPGA) technology. Through our experimental results, we show our PID execution unit gives orders of magnitude improvement in response time when servicing many plants, as compared to a standard general software implementation. We also show that the SPU scales much better than a general software implementation. In addition, these execution units allow the simplifying assumption of dedicated computing medium to hold for control algorithm development.}
}
% Conference paper, DATE, March 2010.
% NOTE(review): no pages/doi recorded in this entry -- add when known.
@inproceedings{GupJon10A,
	author = {Gupte, Adwait and Jones, Phillip},
	title = {An Evaluation of a Slice Fault Aware Tool Chain},
	booktitle = {Proceedings of Design, Automation, and Test in Europe ({DATE})},
	year = {2010},
	month = mar,
	abstract = {As FPGA sizes and densities grow, their manufacturing yields decrease. This work looks toward reclaiming some of this lost yield. Several previous works have suggested fault aware CAD tools for intelligently routing around faults. In this work we evaluate such an approach quantitatively with respect to some standard benchmarks. We also quantify the trade-offs between performance and fault tolerance in such a method. Leveraging existing CAD tools, we show up to 30\% of slices being faulty can be tolerated. Such approaches could potentially allow manufacturers to sell larger chips with manufacturing faults as smaller chips using a nomenclature that appropriately captures the reduction in logic resources.}
}
% Conference paper, ReConFig, December 2009.
% NOTE(review): no pages/doi recorded in this entry -- add when known.
@inproceedings{GupJon09B,
	author = {Gupte, Adwait and Jones, Phillip},
	title = {Hotspot Mitigation using Dynamic Partial Reconfiguration for Improved Performance},
	booktitle = {Proceedings of the International Conference on Reconfigurable Computing and FPGAs ({ReConFig})},
	year = {2009},
	month = dec,
	abstract = {As the chips get denser and faster, heat dissipation is fast turning into a major problem in development of ICs. Nonuniform heating of chips due to hotspots is also an area of concern and much research. In this paper, we propose an adaptive method which takes advantage of the self-reconfiguration capability of modern FPGAs to mitigate hotspots. We adapt the floor plan of the IC in response to the current use and ambient conditions on the fly. It is most applicable to paradigms such as Network on Chip (NoC) that allow separation of communication and computation and allow communication between modules to be abstracted away. We achieve a reduction of up to 8\,$^{\circ}$C in the maximum temperature of a hotspot using typical power numbers. Alternatively, by increasing the frequency, we achieve a 2-3 times increase in throughput while maintaining the same maximum temperature.}
}
% Conference paper, RTCSA, August 2009.
% NOTE(review): no pages/doi recorded in this entry -- add when known.
@inproceedings{GupJon09A,
	author = {Gupte, Adwait and Jones, Phillip},
	title = {Towards Hardware Support for Common Sensor Processing Tasks},
	booktitle = {Proceedings of the International Conference on Embedded and Real-Time Computing Systems and Applications ({RTCSA})},
	year = {2009},
	month = aug,
	abstract = {Sensor processing is a common task within many embedded system domains, such as in control systems sensor feedback used for actuator control, etc. In this paper we have surveyed several embedded system domains, and extracted kernels of computation that are common across applications within a given domain, or across domains. We have shown that adding architectural support for executing these common kernels of computation can yield an overall better system performance. We present a light weight, simplified prototype of a Sensor Processing Unit (SPU) that offloads these computations from the main Arithmetic Logic Unit (ALU) of an embedded processor, and that accesses sensor data in a low latency manner. Our SPU prototype shows an average speed up factor of 2.48 over executing these kernels on an embedded PowerPC processor. A large portion of this speed up is due to our low latency method for accessing sensor data. Isolating our speed up to purely computation still shows an average speed up factor of 1.38 for these kernels.}
}