BibTeX entries for some of Mihai Budiu's publications

(for the ones somebody may want to cite)
 
@TechReport{abadi-tr05,
   author =	{Mart{\'\i}n Abadi and {\'U}lfar Erlingsson and Jay Ligatti and Mihai Budiu},
   title =	{Control-Flow Integrity},
   institution =	{Microsoft Research},
   number =	{MSR-TR-2005-18},
   pages =	{12},
   month =	{February},
   year =	{2005},
   abstract =	{Current software attacks often build on exploits that
                 subvert machine-code execution. The enforcement of a basic
                 safety property, Control-Flow Integrity (CFI), can prevent
                 such attacks from arbitrarily controlling program behavior.
                 CFI enforcement is simple, and its guarantees can be
                 established formally, even with respect to powerful
                 adversaries. Moreover, CFI enforcement is practical: it is
                 compatible with existing software and can be efficiently
                 implemented using software rewriting in commodity systems.
                 Finally, CFI provides a useful foundation for enforcing
                 further security policies, such as policies that constrain
                 the use of data memory.},
   url =	{ftp://ftp.research.microsoft.com/pub/tr/TR-2005-18.pdf}
}
 
@TechReport{erlingsson-tr05,
   key =	{TR 05a},
   author =	{{\'U}lfar Erlingsson and Mihai Budiu and Mart{\'\i}n Abadi and Jay Ligatti},
   title =	{A Theory of Secure Control Flow},
   institution =	{Microsoft Research},
   number =	{MSR-TR-2005-17},
   pages =	{12},
   month =	{February},
   year =	{2005},
   abstract =	{Control-Flow Integrity (CFI) means that the execution of a
                 program dynamically follows only certain paths, in
                 accordance with a static policy. CFI can prevent attacks
                 that, by exploiting buffer overflows and other
                 vulnerabilities, attempt to control program behavior. This
                 paper develops the basic theory that underlies two
                 practical techniques for CFI enforcement, with precise
                 formulations of hypotheses and guarantees.},
   url =	{ftp://ftp.research.microsoft.com/pub/tr/TR-2005-17.pdf}
}
 
@InProceedings{budiu-odes05,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Inter-Iteration Scalar Replacement in the Presence of Conditional Control-Flow},
   booktitle =	{Workshop on Optimizations for {DSP} and Embedded Systems
                 (ODES)},
   pages =	{20--29},
   address =	{San Jose, CA},
   month =	{March},
   year =	{2005},
   abstract =	{A large class of multimedia programs for embedded systems
                 manipulate data represented as dense matrices. In this
                 paper we revisit the classical optimization of scalar
                 replacement of array elements and pointer accesses; this
                 optimization allocates array elements to registers,
                 reducing memory traffic. We generalize the state-of-the-art
                 algorithm, by Carr and Kennedy~\cite{carr-spe94}, improving
                 it to handle simultaneously both conditional control-flow
                 and inter-iteration data reuse. Our algorithm operates
                 within the same assumptions of the classical one (perfect
                 dependence information), and has the same limitations
                 (increased register pressure). It is, however, optimal in
                 the sense that within each code region where scalar
                 promotion is applied, given sufficient registers, each
                 memory location is read/written at most once.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/odes05.pdf}
}
 
@InProceedings{budiu-ispass05,
   author =	{Mihai Budiu and Pedro V. Artigas and Seth Copen Goldstein},
   title =	{Dataflow: A Complement to Superscalar},
   booktitle =	{IEEE International Symposium on Performance Analysis of
                 Systems and Software (ISPASS)},
   pages =	{177-187},
   address =	{Austin, TX},
   month =	{March},
   year =	{2005},
   abstract =	{There has been a resurgence of interest in dataflow
                 architectures, because of their potential for exploiting
                 parallelism with low overhead. In this paper we analyze the
                 performance of a class of static dataflow machines on
                 integer media and control-intensive programs and we explain
                 why a dataflow machine, even with unlimited resources, does
                 not always outperform a superscalar processor on
                 general-purpose codes, under the assumption that both
                 machines take the same time to execute basic operations. We
                 compare a program-specific dataflow machine with unlimited
                 parallelism to a superscalar processor running the same
                 program. While the dataflow machines provide very good
                 performance on most data-parallel programs, we show that
                 the dataflow machine cannot always take advantage of the
                 available parallelism. Using the dynamic critical path we
                 investigate the mechanisms used by superscalar processors
                 to provide a performance advantage and their impact on a
                 dataflow model.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/ispass05.pdf},
   acceptancerate =	{27/92 = 29\%}
}
 
@InProceedings{budiu-asplos04,
   author =	{Mihai Budiu and Girish Venkataramani and Tiberiu Chelcea and Seth Copen Goldstein},
   title =	{Spatial Computation},
   booktitle =	{International Conference on Architectural Support for
                 Programming Languages and Operating Systems (ASPLOS)},
   pages =	{14--26},
   address =	{Boston, MA},
   month =	{October},
   year =	{2004},
   abstract =	{This paper describes a computer architecture that relies
                 on the direct translation of high-level language programs
                 into {\em Spatial Computation} (SC) hardware structures. SC
                 program implementations are completely distributed, without
                 any centralized control. SC circuits are optimized for {\em
                 wires} at the expense of computation units. \par In this
                 paper we investigate a particular implementation SC
                 structures called ASH (Application-Specific Hardware).
                 Under the assumption that computation is cheaper than
                 communication, ASH replicates computation units to simplify
                 interconnect, building a system which uses very simple,
                 completely dedicated communication channels. As a
                 consequence, communication on the datapath never requires
                 arbitration; the only arbitration required is for accessing
                 memory. ASH relies on very simple hardware primitives,
                 using no associative structures, no multiported register
                 files, no scheduling logic, no broadcast, and no clocks. As
                 a consequence, ASH hardware is fast and extremely power
                 efficient. \par In this work we demonstrate three features
                 of ASH: (1) that such architectures can be built by
                 automatic compilation of C programs, (2) that distributed
                 computation is in some respects fundamentally different
                 from monolithic superscalar processors and (3) that ASIC
                 implementations of ASH use 3 orders of magnitude less
                 energy compared to high-end superscalar processors, while
                 being within a factor of two in performance.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/asplos04.pdf},
   doi =	{http://doi.acm.org/10.1145/1024393.1024396},
   acceptancerate =	{24/169 = 14\%}
}
 
@InProceedings{venkataramani-iwls04,
   author =	{Girish Venkataramani and Mihai Budiu and Tiberiu Chelcea and Seth Copen Goldstein},
   title =	{{C} to Asynchronous Dataflow Circuits: An End-to-End Toolflow},
   booktitle =	{International Workshop on Logic synthesis (IWLS)},
   pages =	{501--508},
   address =	{Temecula, CA},
   month =	{June},
   year =	{2004},
   abstract =	{We present a complete toolflow that translates ANSI-C
                 programs into asynchronous circuits. The toolflow is built
                 around a compiler that converts C into a functional
                 dataflow intermediate representation, exposing
                 instruction-level, pipeline and memory parallelism. The
                 compiler performs optimizations and converts the
                 intermediate representation into pipelined asynchronous
                 circuits, with no centralized controllers. In the resulting
                 circuits, control is distributed, communication is achieved
                 through local wires, and arbitration for datapath resources
                 is unnecessary. Circuits automatically synthesized from
                 Mediabench kernels exhibit substantially better
                 energy-delay than either single-issue processors or
                 aggressive superscalar cores.},
   note =	{(full paper)},
   url =	{http://www.cs.cmu.edu/~mihaib/research/iwls04.pdf}
}
 
@InProceedings{koes-msp04,
   author =	{David Koes and Mihai Budiu and Girish Venkataramani and Seth Copen Goldstein},
   title =	{Programmer Specified Pointer Independence},
   booktitle =	{Workshop on Memory System Performance (MSP)},
   month =	{June},
   year =	{2004},
   abstract =	{Good alias analysis is essential in order to achieve high
                 performance on modern processors, yet interprocedural
                 analysis does not scale well. We present a source code
                 annotation, {\tt \#pragma independent}, which is a more
                 flexible, intuitive and useful way for the programmer to
                 provide pointer aliasing information than the current C99
                 {\tt restrict} keyword. We describe a tool which highlights
                 the most important and most likely correct locations at
                 which a programmer can insert the pragmas. We show that
                 such annotations can be used effectively in compilers to
                 achieve speedups of up to 1.2x.},
   note =	{Also as TR CMU-CS-03-123},
   url =	{http://www.cs.cmu.edu/~mihaib/research/msp04.pdf}
}
 
@PhdThesis{budiu-phd03,
   key =	{PhD Thesis 03},
   author =	{Mihai Budiu},
   title =	{Spatial Computation},
   school =	{Carnegie Mellon University, Computer Science Department},
   number =	{CMU-CS-03-217},
   pages =	{225},
   month =	{December},
   year =	{2003},
   abstract =	{This thesis presents a compilation framework for
                 translating ANSI C programs into hardware dataflow
                 machines. The framework is embodied in the CASH compiler, a
                 Compiler for Application-Specific Hardware. CASH generates
                 asynchronous hardware circuits that directly implement the
                 functionality of the source program, without using any
                 interpretative structures. This style of computation is
                 dubbed ``Spatial Computation''. CASH relies extensively on
                 predication and speculation for building efficient hardware
                 circuits. \par The first part of this document describes
                 Pegasus, the internal representation of CASH, and a series
                 of novel program transformations performed by CASH, the
                 most notable of which are a new optimal register-promotion
                 algorithm and partial redundancy elimination for memory
                 accesses based on predicate manipulation. \par The second
                 part of this document evaluates the performance of the
                 generated circuits using simulation. Using media processing
                 benchmarks, we show that for the domain of embedded
                 computation, the circuits generated by CASH can sustain
                 high levels of instruction level parallelism, due to the
                 effective use of dataflow software pipelining. A comparison
                 of Spatial Computation and superscalar processors
                 highlights some of the weaknesses of our model of
                 computation, such as the lack of branch prediction and
                 register renaming. \par The results presented in this
                 document can be applied in several domains: (1) most of the
                 compiler optimizations are applicable to traditional
                 compilers for high-level languages (2) CASH itself can be
                 used as a hardware synthesis tool for very fast
                 system-on-a-chip prototyping directly from C sources, (3)
                 the compilation framework we describe can be applied to the
                 translation of imperative languages to dataflow machines,
                 (4) we have extended the dataflow machine model to
                 encompass predication, data-speculation and
                 control-speculation, and (5) the tool-chain described and
                 some specific optimizations, such as lenient execution, and
                 pipeline balancing, can be used for synthesis and
                 optimization of asynchronous hardware.},
   note =	{Technical report CMU-CS-03-217},
   url =	{http://www.cs.cmu.edu/~mihaib/research/thesis.pdf},
   url2 =	{http://reports-archive.adm.cs.cmu.edu/anon/2003/abstracts/03-217.html}
}
 
@InProceedings{goldstein-asap03,
   author =	{Seth Goldstein and Mihai Budiu and Mahim Mishra and Girish Venkataramani},
   title =	{Reconfigurable Computing and Electronic Nanotechnology},
   booktitle =	{IEEE International Conference on Application-specific
                 Systems, Architectures and Processors},
   pages =	{132--143},
   address =	{Hague, the Netherlands},
   month =	{June 24-26},
   year =	{2003},
   abstract =	{In this paper we examine the opportunities brought about
                 by recent progress in electronic nanotechnology and
                 describe the methods needed to harness them for building a
                 new computer architecture. In this process we decompose
                 some traditional abstractions, such as the transistor, into
                 fine-grain pieces, such as signal restoration and
                 input-output isolation. We also show how we can forgo the
                 extreme reliability of CMOS circuits for low-cost chemical
                 self-assembly at the expense of large manufacturing defect
                 densities. We discuss advanced testing methods which can be
                 used to recover perfect functionality from unreliable
                 parts. We proceed to show how the molecular switch, the
                 regularity of the circuits created by self-assembly and the
                 high defect densities logically require the use of
                 reconfigurable hardware as a basic building block for
                 hardware design. We then capitalize on the convergence of
                 compilation and hardware synthesis (which takes place when
                 programming reconfigurable hardware) to propose the
                 complete elimination of the instruction-set architecture
                 from the system architecture, and the synthesis of
                 asynchronous dataflow machines directly from high-level
                 programming languages, such as C. We discuss in some detail
                 a scalable compilation system that perform this task.},
   note =	{Invited paper},
   url =	{http://www.cs.cmu.edu/~mihaib/research/asap03.pdf},
   url2 =	{http://csdl.computer.org/comp/proceedings/asap/2003/1992/00/19920132abs.htm}
}
 
@InProceedings{budiu-cgo03,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Optimizing Memory Accesses For Spatial Computation},
   booktitle =	{International ACM/IEEE Symposium on Code Generation and
                 Optimization (CGO)},
   pages =	{216--227},
   address =	{San Francisco, CA},
   month =	{March 23--26},
   year =	{2003},
   abstract =	{In this paper we present the internal representation and
                 optimizations used by the CASH compiler for improving the
                 memory parallelism of pointer-based programs. CASH uses an
                 SSA-based representation for memory, which compactly
                 summarizes both control-flow and dependence information. In
                 CASH, memory optimization is a four-step process: (1) first
                 an initial, relatively coarse, representation of memory
                 dependences is built; (2) next, unnecessary memory
                 dependences are removed using dependence tests; (3) third,
                 redundant memory operations are removed (4) finally,
                 parallelism is increased by pipelining memory accesses in
                 loops. While the first three steps above are very general,
                 the loop pipelining transformations are particularly
                 applicable for spatial computation, which is the primary
                 target of CASH. The redundant memory removal optimizations
                 presented are: load/store hoisting (subsuming partial
                 redundancy elimination and common-subexpression
                 elimination), load-after-store removal, store-before-store
                 removal (dead store removal) and loop-invariant load
                 motion. One of our loop pipelining transformations is a new
                 form of loop parallelization, called loop decoupling. This
                 transformation separates independent memory accesses within
                 a loop body into several independent loops, which are
                 allowed dynamically to slip with respect to each other. A
                 new computational primitive, a token generator is used to
                 dynamically control the amount of slip, allowing maximum
                 freedom, while guaranteeing that no memory dependences are
                 violated.},
   url =	{http://www-2.cs.cmu.edu/~mihaib/research/cgo03.pdf},
   url2 =	{http://csdl.computer.org/comp/proceedings/cgo/2003/1913/00/19130216abs.htm}
}
 
@InBook{goldstein-book03,
   key =	{Chapter 03},
   author =	{Seth Copen Goldstein and Mihai Budiu},
   editor =	{Mark A. Reed and Takhee Lee},
   title =	{Molecules, Gates, Circuits, Computers},
   chapter =	{in Molecular Nanoelectronics},
   pages =	{327--388},
   publisher =	{American Scientific Publishers},
   month =	{January},
   year =	{2003},
   url =	{http://aspbs.com/html/a0999frm.htm}
}
 
@InProceedings{venkataramani-fpl02,
   key =	{FPL 02a},
   author =	{Girish Venkataramani and Suraj Sudhir and Mihai Budiu and Seth Copen Goldstein},
   title =	{Factors Influencing the Performance of a CPU-RFU Hybrid Architecture},
   booktitle =	{International Conference on Field Programmable Logic and
                 Applications (FPL)},
   pages =	{955--965},
   address =	{Montpellier (La Grande-Motte), France},
   month =	{September},
   year =	{2002},
   abstract =	{Closely coupling a reconfigurable fabric with a
                 conventional processor has been shown to successfully
                 improve the system performance. However, today s
                 superscalar pro-cessors are both complex and adept at
                 extracting Instruction Level Parallelism (ILP), which
                 introduces many complex issues to the design of a hybrid
                 CPU-RFU system. This paper examines the design of a
                 superscalar processor augmented with a closely-coupled
                 recon-figurable fabric. It identifies architectural and
                 compiler issues that affect the performance of the overall
                 system. Previous efforts at combining a processor core with
                 a reconfigurable fabric are examined in the light of these
                 issues. We also present simulation results that emphasize
                 the impact of these factors.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fpl02-girish.pdf},
   url2 =	{http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=955}
}
 
@InProceedings{budiu-fpl02,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Compiling Application-Specific Hardware},
   booktitle =	{International Conference on Field Programmable Logic and
                 Applications (FPL)},
   pages =	{853--863},
   address =	{Montpellier (La Grande-Motte), France},
   month =	{September 2--4},
   year =	{2002},
   abstract =	{In this paper we describe ASH, an architectural framework
                 for implementing Application-Specific Hardware. ASH is
                 based on automatic hardware synthesis from high-level
                 languages. The generated circuits use only localized
                 computation structures; in consequence, we expect these
                 circuits to be fast, to use little power and to scale well
                 with program complexity. \par We present in detail CASH, a
                 scalable compiler framework for ASH, which generates
                 hardware from programs written in C. Our compiler exploits
                 instruction level parallelism by using aggressive
                 speculation and dynamic scheduling. Based on this
                 compilation scheme, we evaluate the computational resources
                 necessary for implementing complex integer-based programs,
                 and we suggest architectural features that would support
                 the ASH framework.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fpl02-budiu.pdf},
   url2 =	{http://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=853}
}
 
@TechReport{budiu-tr02,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Pegasus: An Efficient Intermediate Representation},
   institution =	{Carnegie Mellon University},
   number =	{CMU-CS-02-107},
   pages =	{20},
   month = may,
   year =	{2002},
   abstract =	{We present Pegasus, a compact and expressive intermediate
                 representation for imperative languages. The representation
                 is suitable for target architectures supporting predicated
                 execution and aggressive speculation. In Pegasus
                 information about the global dataflow of the program is
                 encoded in local structures, enabling compact and efficient
                 algorithms for program optimizations. As a proof of the
                 versatility of Pegasus, we have used it in a compiler
                 translating C programs to hardware implementations.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/tr02.pdf},
   url2 =	{http://reports-archive.adm.cs.cmu.edu/anon/2002/abstracts/02-107.html}
}
 
@InProceedings{budiu-fccm02,
   author =	{Mihai Budiu and Mahim Mishra and Ashwin Bharambe and Seth Copen Goldstein},
   title =	{Peer-to-peer Hardware-Software Interfaces for Reconfigurable Fabrics},
   booktitle =	{IEEE Symposium on Field-Programmable Custom Computing
                 Machines (FCCM)},
   pages =	{57--66},
   address =	{Napa Valley, CA},
   month =	{April},
   year =	{2002},
   abstract =	{In this paper we describe a peer-to-peer interface between
                 processor cores and reconfigurable fabrics. The main
                 advantage of the peer-to-peer model is that it greatly
                 expands the scope of application for reconfigurable
                 computing and hence its potential benefits. The primary
                 extension in our model is that ``code'' on the
                 reconfigurable hardware unit is allowed to invoke routines
                 both on the reconfigurable unit itself and on the fixed
                 logic processor. We describe the software constructs and
                 compilation mechanisms needed for such an architecture,
                 including a detailed description of the interface between
                 the two parts of the application.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fccm02-peer.pdf},
   url2 =	{http://csdl.computer.org/comp/proceedings/fccm/2002/1801/00/18010057abs.htm}
}
 
@InProceedings{goldstein-isca01,
   author =	{Seth Copen Goldstein and Mihai Budiu},
   title =	{{NanoFabrics}: Spatial Computing Using Molecular Electronics},
   booktitle =	{International Symposium on Computer Architecture (ISCA)},
   pages =	{178--189},
   address =	{G\"{o}teborg, Sweden},
   year =	{2001},
   abstract =	{The continuation of the remarkable exponential increases
                 in processing power over the recent past faces imminent
                 challenges due in part to the physics of deep-submicron
                 CMOS devices and the costs of both chip masks and future
                 fabrication plants. A promising solution to these problems
                 is offered by an alternative to CMOS-based computing,
                 chemically assembled electronic nanotechnology (CAEN). In
                 this paper we outline how CAEN based computing can become a
                 reality. We briefly describe recent work in CAEN and how
                 CAEN will affect computer architecture. We show how the
                 inherently reconfigurable natures of CAEN devices can be
                 exploited to provide high-density chips with defect
                 tolerance which will significantly reduce the cost of
                 manufacturing. After developing the basic building blocks
                 of a CAEN based computing devices we present some
                 preliminary results which indicate that CAEN based
                 computing devices can meet or exceed the performance of
                 CMOS based devices.},
   url =	{http://www.cs.cmu.edu/~seth/papers/isca01.pdf},
   doi =	{http://doi.acm.org/10.1145/379240.379262},
   acceptancerate =	{24/163 = 15\%}
}
 
@InProceedings{budiu-europar00,
   key =	{EuroPar 00},
   author =	{Mihai Budiu and Majd Sakr and Kip Walker and Seth Copen Goldstein},
   title =	{{BitValue} Inference: Detecting and Exploiting Narrow Bitwidth Computations},
   booktitle =	{European Conference on Parallel Processing (EUROPAR)},
   series =	{Lecture Notes in Computer Science},
   volume =	{1900},
   pages =	{969--979},
   publisher =	{Springer Verlag},
   address =	{M\"{u}nich, Germany},
   year =	{2000},
   abstract =	{We present a compiler algorithm called BitValue, which can
                 discover both unused and constant bits in dusty-deck C
                 programs. BitValue uses forward and backward dataflow
                 analyses, generalizing constant-folding and dead-code
                 detection at the bit-level. This algorithm enables compiler
                 optimizations which target special processor architectures
                 for computing on non-standard bitwidths. Using this
                 algorithm we show that up to 31\% of the computed bytes are
                 thrown away (for programs from SpecINT95 and Mediabench). A
                 compiler for reconfigurable hardware uses this algorithm to
                 achieve substantial reductions (up to 20-fold) in the size
                 of the synthesized circuits.},
   note =	{An expanded version is in TR 00},
   url =	{http://www.cs.cmu.edu/~mihaib/research/europar00.pdf},
   url2 =	{http://link.springer.de/link/service/series/0558/papers/1900/19000969.pdf}
}
 
@Article{goldstein-ieee00,
   key =	{Computer 00},
   author =	{Seth Copen Goldstein and Herman Schmit and Mihai Budiu and Srihari Cadambi and Matt Moe and Reed Taylor},
   title =	{{PipeRench}: A Reconfigurable Architecture and Compiler},
   journal =	{IEEE Computer},
   volume =	{33},
   number =	{4},
   pages =	{70--77},
   month =	{April},
   year =	{2000},
   abstract =	{With the proliferation of highly specialized embedded
                 computer systems has come a diversification of workloads
                 for computing devices. General-purpose processors are
                 struggling to efficiently meet these applications'
                 disparate needs, and custom hardware is rarely feasible.
                 According to the authors, reconfigurable computing, which
                 combines the flexibility of general-purpose processors with
                 the efficiency of custom hardware, can provide the
                 alternative. PipeRench and its associated compiler comprise
                 the authors' new architecture for reconfigurable computing.
                 Combined with a traditional digital signal processor,
                 microcontroller or general-purpose processor, PipeRench can
                 support a system's various computing needs without
                 requiring custom hardware. The authors describe the
                 PipeRench architecture and how it solves some of the
                 pre-existing problems with FPGA architectures, such as
                 logic granularity, configuration time, forward
                 compatibility, hard constraints and compilation time.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/computer.pdf},
   url2 =	{http://ieeexplore.ieee.org/iel5/2/18132/00839324.pdf}
}
 
@InProceedings{goldstein-isca99,
   author =	{Seth Copen Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R. Reed Taylor and Ronald Laufer},
   title =	{{PipeRench}: a Coprocessor for Streaming Multimedia Acceleration},
   booktitle =	{International Symposium on Computer Architecture (ISCA)},
   pages =	{28--39},
   address =	{Atlanta, GA},
   year =	{1999},
   abstract =	{Future computing workloads will emphasize an
                 architecture's ability to perform relatively simple
                 calculations on massive quantities of mixed-width data.
                 This paper describes a novel reconfigurable fabric
                 architecture, PipeRench, optimized to accelerate these
                 types of computations. PipeRench enables fast, robust
                 compilers, supports forward compatibility, and virtualizes
                 configurations, thus removing the fixed size constraint
                 present in other fabrics. For the first time we explore how
                 the bit-width of processing elements affects performance
                 and show how the PipeRench architecture has been optimized
                 to balance the needs of the compiler against the realities
                 of silicon. Finally, we demonstrate extreme performance
                 speedup on certain computing kernels (up to 190x versus a
                 modern RISC processor), and analyze how this acceleration
                 translates to application speedup.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/isca99.pdf},
   doi =	{http://doi.acm.org/10.1145/300979.300982},
   acceptancerate =	{26/135=19\%}
}
 
@InProceedings{budiu-fpga99,
   author =	{Mihai Budiu and Seth Copen Goldstein},
   title =	{Fast Compilation for Pipelined Reconfigurable Fabrics},
   booktitle =	{ACM/SIGDA International Symposium on Field Programmable
                 Gate Arrays (FPGA)},
   pages =	{195--205},
   address =	{Monterey, CA},
   year =	{1999},
   abstract =	{In this paper we describe a compiler which quickly
                 synthesizes high quality pipelined datapaths for pipelined
                 reconfigurable devices. The compiler uses the same internal
                 representation to perform synthesis, module generation,
                 optimization, and place and route. The core of the compiler
                 is a linear time place and route algorithm more than two
                 orders of magnitude faster than traditional CAD tools. The
                 key behind our approach is that we never backtrack, rip-up,
                 or re-route. Instead, the graph representing the
                 computation is preprocessed to guarantee routability by
                 inserting lazy noops. The preprocessing steps provides
                 enough information to make a greedy strategy feasible. The
                 compilation speed is approximately 3000
                 bit-operations/second (on a PII/400Mhz) for a wide range of
                 applications. The hardware utilization averages 60\% on the
                 target device, PipeRench.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/fpga99.pdf},
   doi =	{http://doi.acm.org/10.1145/296399.296459}
}
 
@Article{birman-tocs99,
   author =	{Kenneth P. Birman and Mark Hayden and Oznur Oskasap and Zhen Xiao and Mihai Budiu and Yaron Minsky},
   title =	{Bimodal Multicast},
   journal =	{Transactions on Computer Systems (TOCS)},
   volume =	{17},
   number =	{2},
   pages =	{41--88},
   month = may,
   year =	{1999},
   abstract =	{There are many methods for making a multicast protocol
                 ``reliable''. At one end of thespectrum, a reliable
                 multicast protocol might offer atomicity guarantees, such
                 as all-ornothing delivery, delivery ordering, and perhaps
                 additional properties such as virtuallysynchronous
                 addressing. At the other are protocols that use local
                 repair to overcome transient packet loss in the network,
                 offering ``best effort'' reliability. Yet none of this
                 priorwork has treated stability of multicast delivery as a
                 basic reliability property, such as might be needed in an
                 internet radio, TV, or conferencing application. This paper
                 looks atreliability with a new goal: development of a
                 multicast protocol which is reliable in a sense that can be
                 rigorously quantified and includes throughput stability
                 guarantees. We characterize this new protocol as a
                 ``bimodal multicast'' in reference to its reliability
                 model, which corresponds to a family of bimodal probability
                 distributions. Here, we introduce theprotocol, provide a
                 theoretical analysis of its behavior, review experimental
                 results, and discuss some candidate applications. These
                 confirm that bimodal multicast is reliable,scalable, and
                 that the protocol provides remarkably stable delivery
                 throughput.},
   url =	{http://www.cs.cmu.edu/~mihaib/research/pbcast.pdf},
   url2 =	{http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p41-birman},
   doi =	{http://doi.acm.org/10.1145/312203.312207}
}