Efficient Place and Route for Pipeline Reconfigurable Architectures

In ICCD '00

Srihari Cadambi and Seth Copen Goldstein

Austin, TX

Sep 2000

% Conference paper by Cadambi and Goldstein, ICCD '00 (Austin, TX).
@inproceedings{cadambi-iccd00,
  author    = {Cadambi, Srihari and Goldstein, Seth Copen},
  title     = {Efficient Place and Route for Pipeline Reconfigurable Architectures},
  booktitle = {ICCD '00},
  address   = {Austin, TX},
  month     = {Sep},
  year      = {2000},
  url       = {http://www.cs.cmu.edu/~seth/papers/cadambi-iccd00.pdf},
  keywords  = {CAD,Place and Route},
}

Related Papers

CAD
	Slack Analysis in the System Design Loop	bib talk
	Girish Venkataramani and Seth Copen Goldstein. In IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS), pages 231–236, Oct 2008.
	% Conference paper, CODES-ISSS 2008. The address uses the US postal abbreviation GA for Georgia.
	@inproceedings{venkataramani-codes08,
	  author    = {Venkataramani, Girish and Goldstein, Seth Copen},
	  title     = {Slack Analysis in the System Design Loop},
	  booktitle = {IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis {(CODES-ISSS)}},
	  address   = {Atlanta, GA},
	  month     = {Oct},
	  year      = {2008},
	  pages     = {231--236},
	  keywords  = {Asychronous Circuits, CAD, Global Critical Path},
	  talk      = {http://www.cs.cmu.edu/~seth/papers/talk-venkataramani-codes08.pdf},
	}
	Area Optimizations for Dual-Rail Circuits Using Relative-Timing Analysis	pdf bib
	Tiberiu Chelcea, Girish Venkataramani, and Seth Copen Goldstein. In Proceedings of the 13th IEEE International Symposium on Asynchronous Circuits and Systems, pages 117–128, Mar 2007.
	% Conference paper, 13th IEEE International Symposium on Asynchronous Circuits and Systems (2007).
	@inproceedings{chelcea-async07,
	  author    = {Chelcea, Tiberiu and Venkataramani, Girish and Goldstein, Seth Copen},
	  title     = {Area Optimizations for Dual-Rail Circuits Using Relative-Timing Analysis},
	  booktitle = {Proceedings of the 13th IEEE International Symposium on Asynchronous Circuits and Systems},
	  address   = {Berkeley, CA},
	  month     = {Mar},
	  year      = {2007},
	  pages     = {117--128},
	  url       = {http://www.cs.cmu.edu/~seth/papers/chelcea-async07.pdf},
	  keywords  = {Asychronous Circuits, CAD},
	  abstract  = {Future deep sub-micron technologies will be characterized by large parametric variations, which could make asynchronous design an attractive solution for use on large scale. However, the investment in asynchronous CAD tools does not approach that in synchronous ones. Even when asynchronous tools leverage existing synchronous toolflows, they introduce large area and speed overheads. This paper proposes several heuristic and optimal algorithms, based on timing interval analysis, for improving existing asynchronous CAD solutions by optimizing area. The optimized circuits are 2.4 times smaller for an optimal algorithm and 1.8 times smaller for a heuristic one than the existing solutions. The optimized circuits are also shown to be resilient to large parametric variations, yielding better average-case latencies than their synchronous counterparts.},
	}
	Global Critical Path: A Tool for System-Level Timing Analysis	pdf bib
	Girish Venkataramani, Mihai Budiu, Tiberiu Chelcea, and Seth Copen Goldstein. In Proceedings of the 44th ACM/IEEE Design Automation Conference, pages 783–786, Jun 2007.
	% Conference paper, 44th ACM/IEEE Design Automation Conference (2007).
	% The abstract's misspelling of "registers" has been corrected.
	@inproceedings{dac07-gcp,
	  author    = {Venkataramani, Girish and Budiu, Mihai and Chelcea, Tiberiu and Goldstein, Seth Copen},
	  title     = {Global Critical Path: A Tool for System-Level Timing Analysis},
	  booktitle = {Proceedings of the 44th ACM/IEEE Design Automation Conference},
	  address   = {San Diego, CA},
	  month     = {Jun},
	  year      = {2007},
	  pages     = {783--786},
	  url       = {http://www.cs.cmu.edu/~seth/papers/dac07-gcp.pdf},
	  keywords  = {Asychronous Circuits, CAD, Global Critical Path, System modeling, Hardware profiling},
	  abstract  = {An effective method for focusing optimization effort on the most important parts of a design is to examine those elements on the critical path. Traditionally, the critical path is defined at the RTL level, as the longest path in the combinational logic between clocked registers. In this paper, we present a system-level timing analysis technique to define the concept of a Global Critical Path (GCP), for predicting system-level performance. We show how the GCP can be used as a theoretical and practical tool for understanding, summarizing and optimizing the behavior of highly concurrent self-timed circuits. We formally define the GCP and show how it can be constructed using a discrete event model and hardware profiling techniques. The GCP provides valuable insight into the control-path behavior of circuits and in finding system-level bottlenecks. We have incorporated the GCP construction and analysis framework into a high-level synthesis and simulation toolchain, thus enabling complete automation in modeling, analysis and optimization.},
	}
	Operation Chaining Asynchronous Pipelined Circuits	pdf bib
	Girish Venkataramani and Seth Copen Goldstein. In ICCAD, Nov 2007.
	% Conference paper, ICCAD 2007. The booktitle spells out the venue name in full.
	% The abstract's "opchained" is hyphenated to agree with "op-chaining" and "op-chain" elsewhere in the same abstract.
	@inproceedings{venkataramani-iccad07,
	  author    = {Venkataramani, Girish and Goldstein, Seth Copen},
	  title     = {Operation Chaining Asynchronous Pipelined Circuits},
	  booktitle = {IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
	  month     = {Nov},
	  year      = {2007},
	  url       = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iccad07.pdf},
	  keywords  = {Asychronous Circuits, CAD, Global Critical Path},
	  abstract  = {We define operation chaining (op-chaining) as an optimization problem to determine the optimal pipeline depth for balancing performance against energy demands in pipelined asynchronous designs. Since there are no clock period requirements, asynchronous pipeline stages can have non-uniform latencies. We exploit this fact to coalesce several stages together thereby saving power and area due to the elimination of control-path resources from the pipeline. The trade-off is potentially reduced pipeline parallelism. In this paper, we formally define this optimization as a graph covering problem, which finds sub-graphs that will be synthesized as an op-chained pipeline stage. We then define the solution space for provably correct solutions and present an algorithm to efficiently search this space. The search technique partitions the graph based on post-dominator relationships to find sub-graphs that are potential op-chain candidates. We use knowledge of the Global Critical Path (GCP) [13] to evaluate the performance impact of accepting a candidate sub-graph and formulate a heuristic cost function to model this trade-off. The algorithm has a quadratic-time complexity in the size of the dataflow graph. We have implemented this algorithm within an automated asynchronous synthesis toolchain [12]. Experimental evidence from applying the algorithm on several media processing kernels reveals that the average energy-delay and energy-delay-area products improve by about 1.4x and 1.8x respectively, with a maximum improvement of 5x and 18x.},
	}
	Leveraging Protocol Knowledge in Slack Matching	pdf bib
	Girish Venkataramani and Seth Copen Goldstein. In IEEE/ACM International Conference on Computer-Aided Design (ICCAD), Nov 2006.
	% Conference paper, IEEE/ACM International Conference on Computer-Aided Design (ICCAD), 2006.
	@inproceedings{venkataramani-iccad06,
	  author    = {Venkataramani, Girish and Goldstein, Seth Copen},
	  title     = {Leveraging Protocol Knowledge in Slack Matching},
	  booktitle = {IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
	  address   = {San Jose, CA},
	  month     = {Nov},
	  year      = {2006},
	  url       = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iccad06.pdf},
	  keywords  = {Asychronous Circuits, Spatial Computing, CAD, Global Critical Path},
	  abstract  = {{Stalls, due to mis-matches in communication rates, are a major performance obstacle in pipelined circuits. If the rate of data production is faster than the rate of consumption, the resulting design performs slower than when the communication rate is matched. This can be remedied by inserting pipeline buffers (to temporarily hold data), allowing the producer to proceed if the consumer is not ready to accept data. The problem of deciding which channels need these buffers (and how many) for an arbitrary communication profile is called the slack matching problem; the optimal solution to this problem has been shown to be NP-complete. \par In this paper, we present a heuristic that uses knowledge of the communication protocol to explicitly model these bottlenecks, and an iterative algorithm to progressively remove these bottlenecks by inserting buffers. We apply this algorithm to asynchronous circuits, and show that it naturally handles large designs with arbitrarily cyclic and acyclic topologies, which exhibit various types of control choice. The heuristic is efficient, achieving linear time complexity in practice, and produces solutions that (a) achieve up to 60\% performance speedup on large media processing kernels, and (b) can either be verified to be optimal, or the approximation margin can be bounded. }},
	}
	Modeling the Global Critical Path in Concurrent Systems	pdf bib
	Girish Venkataramani, Tiberiu Chelcea, Mihai Budiu, and Seth Copen Goldstein. Carnegie Mellon University Technical Report No. CMU-CS-06-144, Aug 2006.
	% Carnegie Mellon University technical report CMU-CS-06-144 (2006).
	@techreport{venkataramani-tr06,
	  author      = {Venkataramani, Girish and Chelcea, Tiberiu and Budiu, Mihai and Goldstein, Seth Copen},
	  title       = {Modeling the Global Critical Path in Concurrent Systems},
	  institution = {Carnegie Mellon University},
	  number      = {CMU-CS-06-144},
	  month       = {Aug},
	  year        = {2006},
	  url         = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tr06.pdf},
	  keywords    = {Asychronous Circuits, Spatial Computing,CAD, Global Critical Path},
	  abstract    = {We show how the global critical path can be used as a practical tool for understanding, optimizing and summarizing the behavior of highly concurrent self-timed circuits. Traditionally, critical path analysis has been applied to DAGs, and thus was constrained to combinatorial sub-circuits. We formally define the global critical path (GCP) and show how it can be constructed using only local information that is automatically derived directly from the circuit. We introduce a form of Production Rules, which can accurately determine the GCP for a given input vector, even for modules which exhibit choice and early termination. \par The GCP provides valuable insight into the control behavior of the application, which help in formulating new optimizations and re-formulating existing ones to use the GCP knowledge. We have constructed a fully automated framework for GCP detection and analysis, and have incorporated this framework into a high-level synthesis tool-chain. We demonstrate the effectiveness of the GCP framework by re-formulating two traditional CAD optimizations to use the GCP, yielding efficient algorithms which improve circuit power (by up to 9\%) and performance (by up to 60\%) in our experiments.},
	}
	SOMA: A Tool for Synthesizing and Optimizing Memory Accesses in ASICs	pdf bib
	Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein. In IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS), pages 231–236, Sep 2005.
	% Conference paper, CODES-ISSS 2005.
	% The page range uses the en-dash form (--); the acronyms in the title are braced so sentence-casing styles keep their capitals.
	@inproceedings{venkataramani-isss05,
	  author    = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea, Tiberiu and Goldstein, Seth Copen},
	  title     = {{SOMA}: A Tool for Synthesizing and Optimizing Memory Accesses in {ASICs}},
	  booktitle = {IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS)},
	  address   = {Jersey City, NJ, USA},
	  month     = {Sep},
	  year      = {2005},
	  pages     = {231--236},
	  isbn      = {1-59593-161-9},
	  url       = {http://www.cs.cmu.edu/~seth/papers/venkataramani-isss05.pdf},
	  keywords  = {Asychronous Circuits, Spatial Computing,Phoenix, CAD,Compilers:Memory Optimizations},
	  abstract  = {Arbitrary memory dependencies and variable latency memory systems are major obstacles to the synthesis of large-scale ASIC systems in high-level synthesis. This paper presents SOMA, a synthesis framework for constructing Memory Access Network (MAN) architectures that inherently enforce memory consistency in the presence of dynamic memory access dependencies. A fundamental bottleneck in any such network is arbitrating between concurrent accesses to a shared memory resource. To alleviate this bottleneck, SOMA uses an application-specific concurrency analysis technique to predict the dynamic memory parallelism profile of the application. This is then used to customize the MAN architecture. Depending on the parallelism profile, the MAN may be optimized for latency, throughput or both. The optimized MAN is automatically synthesized into gate-level structural Verilog using a flexible library of network building blocks. SOMA has been successfully integrated into an automated C-to-hardware synthesis flow, which generates standard cell circuits from unrestricted ANSI-C programs. Post-layout experiments demonstrate that application specific MAN construction significantly improves power and performance.},
	}
	Translating ANSI C to Asynchronous Circuits	pdf bib
	Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein. In 10th IEEE International Symposium on Asynchronous Circuits and Systems (ASYNC '04), Apr 2004.
	% Conference paper, ASYNC '04.
	% "ANSI" and "C" are braced so sentence-casing styles do not lowercase them.
	@inproceedings{budiu-async04,
	  author    = {Budiu, Mihai and Venkataramani, Girish and Chelcea, Tiberiu and Goldstein, Seth Copen},
	  title     = {Translating {ANSI} {C} to Asynchronous Circuits},
	  booktitle = {10th IEEE International Symposium on Asynchronous Circuits and Systems (ASYNC '04)},
	  address   = {Crete, Greece},
	  month     = {Apr},
	  year      = {2004},
	  url       = {http://www.cs.cmu.edu/~seth/papers/budiu-async04.pdf},
	  keywords  = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault and Defect Tolerance,Phoenix,Reconfigurable Computing,Spatial Computing},
	}
	C to Asynchronous Dataflow Circuits: An End-to-End Toolflow	pdf bib
	Girish Venkataramani, Mihai Budiu, Tiberiu Chelcea, and Seth Copen Goldstein. In IEEE 13th International Workshop on Logic Synthesis (IWLS), Jun 2004.
	% Workshop paper, IEEE 13th International Workshop on Logic Synthesis (IWLS), 2004.
	@inproceedings{venkataramani-iwls04,
	  author    = {Venkataramani, Girish and Budiu, Mihai and Chelcea, Tiberiu and Goldstein, Seth Copen},
	  title     = {{C} to Asynchronous Dataflow Circuits: An End-to-End Toolflow},
	  booktitle = {IEEE 13th International Workshop on Logic Synthesis (IWLS)},
	  address   = {Temecula, CA},
	  month     = {Jun},
	  year      = {2004},
	  url       = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls04.pdf},
	  keywords  = {Asychronous Circuits,Spatial Computing,Phoenix,CAD},
	  abstract  = {We present a complete toolflow that translates ANSI-C programs into asynchronous circuits. The toolflow is built around a compiler that converts C into a functional dataflow intermediate representation, exposing instruction-level, pipeline and memory parallelism. The compiler performs optimizations and converts the intermediate representation into pipelined asynchronous circuits, with no centralized controllers. In the resulting circuits, control is distributed, communication is achieved through local wires, and arbitration for datapath resources is unnecessary. Circuits automatically synthesized from Mediabench kernels exhibit substantially better energy-delay than either single-issue processors or aggressive superscalar cores.},
	}
	Molecules, Gates, Circuits, Computer	pdf bib
	Seth Copen Goldstein and Mihai Budiu. In Molecular Nanoelectronics, Jan 2003.
	% Book chapter in "Molecular Nanoelectronics" (2003).
	% The ISBN is the ten-digit form 1-58883-006-3, which passes the ISBN-10 checksum; editors are in "Last, First" form like the authors.
	@incollection{goldstein-mn03,
	  author    = {Goldstein, Seth Copen and Budiu, Mihai},
	  title     = {Molecules, Gates, Circuits, Computer},
	  booktitle = {Molecular Nanoelectronics},
	  editor    = {Reed, Mark A. and Lee, Takhee},
	  publisher = {American Scientific Publishers},
	  address   = {Stevenson Ranch, CA},
	  month     = {Jan},
	  year      = {2003},
	  isbn      = {1-58883-006-3},
	  url       = {http://www.cs.cmu.edu/~seth/papers/goldstein-mn03.pdf},
	  keywords  = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault and Defect Tolerance,Reconfigurable Computing,Spatial Computing,electronic nanotechnology,molecular electronics},
	}
	MolSpice: Designing Molecular Logic Circuits	pdf bib
	Seth Copen Goldstein, James Ellenbogen, David Almassiam, Matt Brown, Mark Cannarsa, Jesse Klein, Schuyler Schell, Geoff Washburn, and Matthew M Ziegler. In Ninth Foresight Conference on Molecular Nanotechnology, Nov 2001.
	% Conference paper, Ninth Foresight Conference on Molecular Nanotechnology (2001).
	% "MolSpice" is braced so sentence-casing styles keep its internal capital.
	@inproceedings{goldstein-foresight01,
	  author    = {Goldstein, Seth Copen and Ellenbogen, James and Almassiam, David and Brown, Matt and Cannarsa, Mark and Klein, Jesse and Schell, Schuyler and Washburn, Geoff and Ziegler, Matthew M},
	  title     = {{MolSpice}: Designing Molecular Logic Circuits},
	  booktitle = {Ninth Foresight Conference on Molecular Nanotechnology},
	  address   = {Santa Clara, CA},
	  month     = {Nov},
	  year      = {2001},
	  url       = {http://www.cs.cmu.edu/~seth/papers/goldstein-foresight01.pdf},
	  keywords  = {Electronic Nanotechnology, Molecular Electronics, CAD},
	}
	Static Profile-driven Compilation for FPGAs	pdf bib
	Srihari Cadambi and Seth Copen Goldstein. In Proceedings of the 11th International Conference on Field-Programmable Logic and Applications, Aug 2001.
	% Conference paper, 11th International Conference on Field-Programmable Logic and Applications (2001).
	% "FPGAs" is braced so sentence-casing styles do not lowercase the acronym.
	@inproceedings{cadambi-fpl01,
	  author    = {Cadambi, Srihari and Goldstein, Seth Copen},
	  title     = {Static Profile-driven Compilation for {FPGAs}},
	  booktitle = {Proceedings of the 11th International Conference on Field-Programmable Logic and Applications},
	  address   = {Belfast, Northern Ireland},
	  month     = {Aug},
	  year      = {2001},
	  url       = {http://www.cs.cmu.edu/~seth/papers/cadambi-fpl01.pdf},
	  keywords  = {CAD,Reconfigurable Computing},
	}
	BitValue Inference: Detecting and Exploiting Narrow Bitwidth Computations	pdf bib
	Mihai Budiu and Seth Copen Goldstein. Carnegie Mellon University Technical Report, Jun 2000. See budiu-europar00.
	% Carnegie Mellon University technical report (2000); companion to budiu-europar00.
	% The report number is given in `number`, the field @techreport uses; the original `booktitle` text is kept unchanged.
	% "BitValue" is braced so sentence-casing styles keep its internal capital.
	@techreport{budiu-tr00,
	  author      = {Budiu, Mihai and Goldstein, Seth Copen},
	  title       = {{BitValue} Inference: Detecting and Exploiting Narrow Bitwidth Computations},
	  institution = {Carnegie Mellon University},
	  number      = {CMU-CS-00-141},
	  booktitle   = {CMU CS Technical Report, CMU-CS-00-141},
	  month       = {Jun},
	  year        = {2000},
	  url         = {http://www.cs.cmu.edu/~seth/papers/budiu-tr00.pdf},
	  see         = {budiu-europar00},
	  keywords    = {CAD,Compilers:CASH,Reconfigurable Computing},
	}
	Efficient Place and Route for Pipeline Reconfigurable Architectures	pdf bib
	Srihari Cadambi and Seth Copen Goldstein. In ICCD '00, Sep 2000.
	% Conference paper by Cadambi and Goldstein, ICCD '00 (Austin, TX).
	@inproceedings{cadambi-iccd00,
	  author    = {Cadambi, Srihari and Goldstein, Seth Copen},
	  title     = {Efficient Place and Route for Pipeline Reconfigurable Architectures},
	  booktitle = {ICCD '00},
	  address   = {Austin, TX},
	  month     = {Sep},
	  year      = {2000},
	  url       = {http://www.cs.cmu.edu/~seth/papers/cadambi-iccd00.pdf},
	  keywords  = {CAD,Place and Route},
	}
	BitValue Inference: Detecting and Exploiting Narrow Bitwidth Computations	pdf bib
	Mihai Budiu, Majd Sakr, Kevin Walker, and Seth Copen Goldstein. In Proceedings of the 2000 Europar Conference, volume 1900, pages 969–979, Aug 2000. Also appeared as CMU CS Technical Report, CMU-CS-00-141, October 2000.
	% Conference paper, Europar 2000 (Lecture Notes in Computer Science, volume 1900).
	% The `also` note carries no trailing period, so a renderer that appends its own punctuation does not double it.
	@inproceedings{budiu-europar00,
	  author    = {Budiu, Mihai and Sakr, Majd and Walker, Kevin and Goldstein, Seth Copen},
	  title     = {{BitValue} Inference: Detecting and Exploiting Narrow Bitwidth Computations},
	  booktitle = {Proceedings of the 2000 Europar Conference},
	  series    = {Lecture Notes in Computer Science},
	  volume    = {1900},
	  publisher = {Springer Verlag},
	  address   = {Munich, Germany},
	  month     = {Aug},
	  year      = {2000},
	  pages     = {969--979},
	  issn      = {0302-9743},
	  url       = {http://www.cs.cmu.edu/~seth/papers/budiu-europar00.pdf},
	  also      = {CMU CS Technical Report, CMU-CS-00-141, October 2000},
	  keywords  = {Spatial Computing,Reconfigurable Computing,Phoenix,PipeRench,CAD},
	  abstract  = {We present a compiler algorithm called BitValue, which can discover both unused and constant bits in dusty-deck C programs. BitValue uses forward and backward dataflow analyses, generalizing constant-folding and dead-code detection at the bit-level. This algorithm enables compiler optimizations which target special processor architectures for computing on non-standard bitwidths. Using this algorithm we show that up to 31\% of the computed bytes are thrown away (for programs from SpecINT95 and Mediabench). A compiler for reconfigurable hardware uses this algorithm to achieve substantial reductions (up to 20-fold) in the size of the synthesized circuits.},
	}
	CPR: A Configuration Profiling Tool	pdf bib
	Srihari Cadambi and Seth Copen Goldstein. In 7th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM '99), pages 104, Apr 1999.
	% Conference paper, FCCM '99.
	% "CPR" is braced so sentence-casing styles do not lowercase the acronym.
	@inproceedings{cadambi-fccm99,
	  author    = {Cadambi, Srihari and Goldstein, Seth Copen},
	  title     = {{CPR}: A Configuration Profiling Tool},
	  booktitle = {7th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM '99)},
	  address   = {Napa Valley, CA},
	  month     = {Apr},
	  year      = {1999},
	  pages     = {104},
	  url       = {http://www.cs.cmu.edu/~seth/papers/cadambi-fccm99.pdf},
	  keywords  = {CAD,Reconfigurable Computing,Place And Route},
	}
Place And Route
	Efficient Place and Route for Pipeline Reconfigurable Architectures	pdf bib
	Srihari Cadambi and Seth Copen Goldstein. In ICCD '00, Sep 2000.
	% Conference paper by Cadambi and Goldstein, ICCD '00 (Austin, TX).
	@inproceedings{cadambi-iccd00,
	  author    = {Cadambi, Srihari and Goldstein, Seth Copen},
	  title     = {Efficient Place and Route for Pipeline Reconfigurable Architectures},
	  booktitle = {ICCD '00},
	  address   = {Austin, TX},
	  month     = {Sep},
	  year      = {2000},
	  url       = {http://www.cs.cmu.edu/~seth/papers/cadambi-iccd00.pdf},
	  keywords  = {CAD,Place and Route},
	}
	CPR: A Configuration Profiling Tool	pdf bib
	Srihari Cadambi and Seth Copen Goldstein. In 7th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM '99), pages 104, Apr 1999.
	% Conference paper, FCCM '99.
	% "CPR" is braced so sentence-casing styles do not lowercase the acronym.
	@inproceedings{cadambi-fccm99,
	  author    = {Cadambi, Srihari and Goldstein, Seth Copen},
	  title     = {{CPR}: A Configuration Profiling Tool},
	  booktitle = {7th Annual IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM '99)},
	  address   = {Napa Valley, CA},
	  month     = {Apr},
	  year      = {1999},
	  pages     = {104},
	  url       = {http://www.cs.cmu.edu/~seth/papers/cadambi-fccm99.pdf},
	  keywords  = {CAD,Reconfigurable Computing,Place And Route},
	}
	Fast Compilation for Pipelined Reconfigurable Fabrics	pdf bib
	Mihai Budiu and Seth Copen Goldstein. In Proceedings of the 1999 ACM/SIGDA Seventh International Symposium on Field Programmable Gate Arrays (FPGA '99), pages 195–205, Feb 1999.
	% Conference paper, FPGA '99.
	% The page range uses the en-dash form (--), matching the other entries with page ranges.
	@inproceedings{budiu-fpga99,
	  author    = {Budiu, Mihai and Goldstein, Seth Copen},
	  title     = {Fast Compilation for Pipelined Reconfigurable Fabrics},
	  booktitle = {Proceedings of the 1999 ACM/SIGDA Seventh International Symposium on Field Programmable Gate Arrays (FPGA '99)},
	  month     = {Feb},
	  year      = {1999},
	  pages     = {195--205},
	  url       = {http://www.cs.cmu.edu/~seth/papers/budiu-fpga99.pdf},
	  keywords  = {Reconfigurable Computing,PipeRench,Place and Route},
	  abstract  = {In this paper we describe a compiler which quickly synthesizes high quality pipelined datapaths for pipelined reconfigurable devices. The compiler uses the same internal representation to perform synthesis, module generation, optimization, and place and route. The core of the compiler is a linear time place and route algorithm more than two orders of magnitude faster than traditional CAD tools. The key behind our approach is that we never backtrack, rip-up, or re-route. Instead, the graph representing the computation is preprocessed to guarantee routability by inserting lazy noops. The preprocessing steps provides enough information to make a greedy strategy feasible. The compilation speed is approximately 3000 bit-operations/second (on a PII/400Mhz) for a wide range of applications. The hardware utilization averages 60\% on the target device, PipeRench.},
	}

Back to publications list