%% bare_conf.tex
%% V1.3
%% 2007/01/11
%% by Michael Shell
%% See:
%% http://www.michaelshell.org/
%% for current contact information.
%%
%% This is a skeleton file demonstrating the use of IEEEtran.cls
%% (requires IEEEtran.cls version 1.7 or later) with an IEEE conference paper.
%%
%% Support sites:
%% http://www.michaelshell.org/tex/ieeetran/
%% http://www.ctan.org/tex-archive/macros/latex/contrib/IEEEtran/
%% and
%% http://www.ieee.org/

%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE! 
%% User assumes all risk.
%% In no event shall IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.
%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including  **
%% ** renaming them and changing author support contact information. **
%%
%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
%%                    bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex
%%*************************************************************************

% *** Authors should verify (and, if needed, correct) their LaTeX system  ***
% *** with the testflow diagnostic prior to trusting their LaTeX platform ***
% *** with production work. IEEE's font choices can trigger bugs that do  ***
% *** not appear when using other class files.                            ***
% The testflow support page is at:
% http://www.michaelshell.org/tex/testflow/


% Note that the a4paper option is mainly intended so that authors in
% countries using A4 can easily print to A4 and see how their papers will
% look in print - the typesetting of the document will not typically be
% affected with changes in paper size (but the bottom and side margins will).
% Use the testflow package mentioned above to verify correct handling of
% both paper sizes by the user's LaTeX system.
%
% Also note that the "draftcls" or "draftclsnofoot", not "draft", option
% should be used if it is desired that the figures are to be displayed in
% draft mode.
%
\documentclass[conference]{IEEEtran}
% Add the compsoc option for Computer Society conferences.
%
% If IEEEtran.cls has not been installed into the LaTeX system files,
% manually specify the path to it like:
% \documentclass[conference]{../sty/IEEEtran}


% Some very useful LaTeX packages include:
% (uncomment the ones you want to load)


% *** MISC UTILITY PACKAGES ***
%
%\usepackage{ifpdf}
% Heiko Oberdiek's ifpdf.sty is very useful if you need conditional
% compilation based on whether the output is pdf or dvi.
% usage:
% \ifpdf
%   % pdf code
% \else
%   % dvi code
% \fi
% The latest version of ifpdf.sty can be obtained from:
% http://www.ctan.org/tex-archive/macros/latex/contrib/oberdiek/
% Also, note that IEEEtran.cls V1.7 and later provides a builtin
% \ifCLASSINFOpdf conditional that works the same way.
% When switching from latex to pdflatex and vice-versa, the compiler may
% have to be run twice to clear warning/error messages.


% *** CITATION PACKAGES ***
%
%\usepackage{cite}
% cite.sty was written by Donald Arseneau
% V1.6 and later of IEEEtran pre-defines the format of the cite.sty package
% \cite{} output to follow that of IEEE. Loading the cite package will
% result in citation numbers being automatically sorted and properly
% "compressed/ranged". e.g., [1], [9], [2], [7], [5], [6] without using
% cite.sty will become [1], [2], [5]--[7], [9] using cite.sty. cite.sty's
% \cite will automatically add leading space, if needed. Use cite.sty's
% noadjust option (cite.sty V3.8 and later) if you want to turn this off.
% cite.sty is already installed on most LaTeX systems. Be sure and use
% version 4.0 (2003-05-27) and later if using hyperref.sty. cite.sty does
% not currently provide for hyperlinked citations.
% The latest version can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/cite/
% The documentation is contained in the cite.sty file itself.


% *** GRAPHICS RELATED PACKAGES ***
\usepackage{graphicx} % Required for including pictures
%
\ifCLASSINFOpdf
  % \usepackage[pdftex]{graphicx}
  % declare the path(s) where your graphic files are
  % \graphicspath{{../pdf/}{../jpeg/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  % \DeclareGraphicsExtensions{.pdf,.jpeg,.png}
\else
  % or other class option (dvipsone, dvipdf, if not using dvips). graphicx
  % will default to the driver specified in the system graphics.cfg if no
  % driver is specified.
  % \usepackage[dvips]{graphicx}
  % declare the path(s) where your graphic files are
  % \graphicspath{{../eps/}}
  % and their extensions so you won't have to specify these with
  % every instance of \includegraphics
  % \DeclareGraphicsExtensions{.eps}
\fi
% graphicx was written by David Carlisle and Sebastian Rahtz. It is
% required if you want graphics, photos, etc. graphicx.sty is already
% installed on most LaTeX systems. The latest version and documentation can
% be obtained at: 
% http://www.ctan.org/tex-archive/macros/latex/required/graphics/
% Another good source of documentation is "Using Imported Graphics in
% LaTeX2e" by Keith Reckdahl which can be found as epslatex.ps or
% epslatex.pdf at: http://www.ctan.org/tex-archive/info/
%
% latex, and pdflatex in dvi mode, support graphics in encapsulated
% postscript (.eps) format. pdflatex in pdf mode supports graphics
% in .pdf, .jpeg, .png and .mps (metapost) formats. Users should ensure
% that all non-photo figures use a vector format (.eps, .pdf, .mps) and
% not a bitmapped formats (.jpeg, .png). IEEE frowns on bitmapped formats
% which can result in "jaggedy"/blurry rendering of lines and letters as
% well as large increases in file sizes.
%
% You can find documentation about the pdfTeX application at:
% http://www.tug.org/applications/pdftex


% *** MATH PACKAGES ***
%
%\usepackage[cmex10]{amsmath}
% A popular package from the American Mathematical Society that provides
% many useful and powerful commands for dealing with mathematics. If using
% it, be sure to load this package with the cmex10 option to ensure that
% only type 1 fonts will utilized at all point sizes. Without this option,
% it is possible that some math symbols, particularly those within
% footnotes, will be rendered in bitmap form which will result in a
% document that can not be IEEE Xplore compliant!
%
% Also, note that the amsmath package sets \interdisplaylinepenalty to 10000
% thus preventing page breaks from occurring within multiline equations. Use:
%\interdisplaylinepenalty=2500
% after loading amsmath to restore such page breaks as IEEEtran.cls normally
% does. amsmath.sty is already installed on most LaTeX systems. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/amslatex/math/


% *** SPECIALIZED LIST PACKAGES ***
%
%\usepackage{algorithmic}
% algorithmic.sty was written by Peter Williams and Rogerio Brito.
% This package provides an algorithmic environment fo describing algorithms.
% You can use the algorithmic environment in-text or within a figure
% environment to provide for a floating algorithm. Do NOT use the algorithm
% floating environment provided by algorithm.sty (by the same authors) or
% algorithm2e.sty (by Christophe Fiorio) as IEEE does not use dedicated
% algorithm float types and packages that provide these will not provide
% correct IEEE style captions. The latest version and documentation of
% algorithmic.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithms/
% There is also a support site at:
% http://algorithms.berlios.de/index.html
% Also of interest may be the (relatively newer and more customizable)
% algorithmicx.sty package by Szasz Janos:
% http://www.ctan.org/tex-archive/macros/latex/contrib/algorithmicx/


% *** ALIGNMENT PACKAGES ***
%
%\usepackage{array}
% Frank Mittelbach's and David Carlisle's array.sty patches and improves
% the standard LaTeX2e array and tabular environments to provide better
% appearance and additional user controls. As the default LaTeX2e table
% generation code is lacking to the point of almost being broken with
% respect to the quality of the end results, all users are strongly
% advised to use an enhanced (at the very least that provided by array.sty)
% set of table tools. array.sty is already installed on most systems. The
% latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/required/tools/


%\usepackage{mdwmath}
%\usepackage{mdwtab}
% Also highly recommended is Mark Wooding's extremely powerful MDW tools,
% especially mdwmath.sty and mdwtab.sty which are used to format equations
% and tables, respectively. The MDWtools set is already installed on most
% LaTeX systems. The lastest version and documentation is available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/mdwtools/


% IEEEtran contains the IEEEeqnarray family of commands that can be used to
% generate multiline equations as well as matrices, tables, etc., of high
% quality.


%\usepackage{eqparbox}
% Also of notable interest is Scott Pakin's eqparbox package for creating
% (automatically sized) equal width boxes - aka "natural width parboxes".
% Available at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/eqparbox/


% *** SUBFIGURE PACKAGES ***
%\usepackage[tight,footnotesize]{subfigure}
% subfigure.sty was written by Steven Douglas Cochran. This package makes it
% easy to put subfigures in your figures. e.g., "Figure 1a and 1b". For IEEE
% work, it is a good idea to load it with the tight package option to reduce
% the amount of white space around the subfigures. subfigure.sty is already
% installed on most LaTeX systems. The latest version and documentation can
% be obtained at:
% http://www.ctan.org/tex-archive/obsolete/macros/latex/contrib/subfigure/
% subfigure.sty has been superceeded by subfig.sty.


%\usepackage[caption=false]{caption}
%\usepackage[font=footnotesize]{subfig}
% subfig.sty, also written by Steven Douglas Cochran, is the modern
% replacement for subfigure.sty. However, subfig.sty requires and
% automatically loads Axel Sommerfeldt's caption.sty which will override
% IEEEtran.cls handling of captions and this will result in nonIEEE style
% figure/table captions. To prevent this problem, be sure and preload
% caption.sty with its "caption=false" package option. This is will preserve
% IEEEtran.cls handing of captions. Version 1.3 (2005/06/28) and later 
% (recommended due to many improvements over 1.2) of subfig.sty supports
% the caption=false option directly:
%\usepackage[caption=false,font=footnotesize]{subfig}
%
% The latest version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/subfig/
% The latest version and documentation of caption.sty can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/caption/

\usepackage{booktabs}


% *** FLOAT PACKAGES ***
%
%\usepackage{fixltx2e}
% fixltx2e, the successor to the earlier fix2col.sty, was written by
% Frank Mittelbach and David Carlisle. This package corrects a few problems
% in the LaTeX2e kernel, the most notable of which is that in current
% LaTeX2e releases, the ordering of single and double column floats is not
% guaranteed to be preserved. Thus, an unpatched LaTeX2e can allow a
% single column figure to be placed prior to an earlier double column
% figure. The latest version and documentation can be found at:
% http://www.ctan.org/tex-archive/macros/latex/base/


%\usepackage{stfloats}
% stfloats.sty was written by Sigitas Tolusis. This package gives LaTeX2e
% the ability to do double column floats at the bottom of the page as well
% as the top. (e.g., "\begin{figure*}[!b]" is not normally possible in
% LaTeX2e). It also provides a command:
%\fnbelowfloat
% to enable the placement of footnotes below bottom floats (the standard
% LaTeX2e kernel puts them above bottom floats). This is an invasive package
% which rewrites many portions of the LaTeX2e float routines. It may not work
% with other packages that modify the LaTeX2e float routines. The latest
% version and documentation can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/sttools/
% Documentation is contained in the stfloats.sty comments as well as in the
% presfull.pdf file. Do not use the stfloats baselinefloat ability as IEEE
% does not allow \baselineskip to stretch. Authors submitting work to the
% IEEE should note that IEEE rarely uses double column equations and
% that authors should try to avoid such use. Do not be tempted to use the
% cuted.sty or midfloat.sty packages (also by Sigitas Tolusis) as IEEE does
% not format its papers in such ways.


% *** PDF, URL AND HYPERLINK PACKAGES ***
%
%\usepackage{url}
% url.sty was written by Donald Arseneau. It provides better support for
% handling and breaking URLs. url.sty is already installed on most LaTeX
% systems. The latest version can be obtained at:
% http://www.ctan.org/tex-archive/macros/latex/contrib/misc/
% Read the url.sty source comments for usage information. Basically,
% \url{my_url_here}.


% *** Do not adjust lengths that control margins, column widths, etc. ***
% *** Do not use packages that alter fonts (such as pslatex).         ***
% There should be no need to do such things with IEEEtran.cls V1.6 and later.
% (Unless specifically asked to do so by the journal or conference you plan
% to submit to, of course. )


% correct bad hyphenation here
\hyphenation{op-tical net-works semi-conduc-tor}


\begin{document}
%
% paper title
% can use linebreaks \\ within to get better formatting as desired
\title{Maximum Clique Solver using Bitsets on GPUs}


% author names and affiliations
% use a multiple column layout for up to three different
% affiliations

\author{\IEEEauthorblockN{Matthew VanCompernolle\IEEEauthorrefmark{1},
Lee Barford\IEEEauthorrefmark{1}\IEEEauthorrefmark{2},
and Frederick Harris, Jr.\IEEEauthorrefmark{1}}

\IEEEauthorblockA{\IEEEauthorrefmark{1}Department of
Computer Science and Engineering, University of Nevada, Reno}

\IEEEauthorblockA{\IEEEauthorrefmark{2}Keysight Laboratories, Keysight Technologies}
email: mvancomp@nevada.unr.edu, lee\_barford@ieee.org, Fred.Harris@cse.unr.edu}

% conference papers do not typically use \thanks and this command
% is locked out in conference mode. If really needed, such as for
% the acknowledgment of grants, issue a \IEEEoverridecommandlockouts
% after \documentclass

% for over three affiliations, or if they all won't fit within the width
% of the page, use this alternative format:
% 
%\author{\IEEEauthorblockN{Michael Shell\IEEEauthorrefmark{1},
%Homer Simpson\IEEEauthorrefmark{2},
%James Kirk\IEEEauthorrefmark{3}, 
%Montgomery Scott\IEEEauthorrefmark{3} and
%Eldon Tyrell\IEEEauthorrefmark{4}}
%\IEEEauthorblockA{\IEEEauthorrefmark{1}School of Electrical and Computer Engineering\\
%Georgia Institute of Technology,
%Atlanta, Georgia 30332--0250\\ Email: see http://www.michaelshell.org/contact.html}
%\IEEEauthorblockA{\IEEEauthorrefmark{2}Twentieth Century Fox, Springfield, USA\\
%Email: homer@thesimpsons.com}
%\IEEEauthorblockA{\IEEEauthorrefmark{3}Starfleet Academy, San Francisco, California 96678-2391\\
%Telephone: (800) 555--1212, Fax: (888) 555--1212}
%\IEEEauthorblockA{\IEEEauthorrefmark{4}Tyrell Inc., 123 Replicant Street, Los Angeles, California 90210--4321}}


% use for special paper notices
%\IEEEspecialpapernotice{(Invited Paper)}


% make the title area
\maketitle


\begin{abstract}
%\boldmath
Finding the maximum clique in a graph is useful for solving problems in many real world applications. However the problem is classified as NP-hard,  thus making it very difficult to solve for large and dense graphs. This paper presents one of the only exact maximum clique solvers that takes advantage of the parallelism of Graphical Processing Units (GPUs). The algorithm makes use of bitsets to reduce the amount of storage space needed and take advantage of bit-level parallelism in hardware to increase performance. The results show that the GPU implementation of the algorithm performs better than the corresponding sequential algorithm in almost all cases; performance gains tend to be more prominent on larger graph sizes that can be solved using more levels of parallelism.
\end{abstract}
% IEEEtran.cls defaults to using nonbold math in the Abstract.
% This preserves the distinction between vectors and scalars. However,
% if the conference you are submitting to favors bold math in the abstract,
% then you can use LaTeX's standard command \boldmath at the very start
% of the abstract to achieve this. Many IEEE journals/conferences frown on
% math in the abstract anyway.

% no keywords


% For peer review papers, you can put extra information on the cover
% page as needed:
% \ifCLASSOPTIONpeerreview
% \begin{center} \bfseries EDICS Category: 3-BBND \end{center}
% \fi
%
% For peerreview papers, this IEEEtran command inserts a page break and
% creates the second title. It will be ignored for other modes.
\IEEEpeerreviewmaketitle


\section{Introduction}
% no \IEEEPARstart
A clique, or complete sub-graph, is a subset of the vertices in an undirected graph, such that there exists and edge between every pair of vertices in the subset. The Maximum Clique Problem (MCP) is one of the most studied NP-hard problems which consists of finding the largest possible clique in a graph. For any large or dense graph, MCP often cannot be solved in a reasonable amount of time even on high end machines due to its exponential time complexity. Despite the difficulty of the problem, finding a maximum clique has many useful applications in areas such as bioinformatics, computer vision, robotics, patterns in telecommunications traffic, and more \cite{san2011exact,washington}. For this reason and the fact that many real-world graphs do not elicit worst-case behaviour for MCP, new algorithms and optimizations to the problem continue to be a popular field of research \cite{ryanrossi}.

One emerging trend in research is the use of Graphical Processing Units (GPUs) for general purpose computing to develop high performance applications.  The release of NVIDIA's CUDA, a general purpose parallel computing platform released in 2006, enabled programmers to finally take advantage of the parallelism and high computational power of NVIDIA's GPUs while enabling a low learning curve for developers familiar with standard programming languages \cite{nvidiawebsite}.

This paper describes the implementation of a maximum clique algorithm on GPUs that utilizes bitset representations to reduce memory requirements and increase performance. The implementation does many things that most maximum clique solvers to not attempt, despite the popularity of the field of research. The algorithm presented is one of very few maximum clique solvers that runs on GPUs, makes use of recursion on the GPU, and supports systems with multiple GPUs. 

The rest of the paper is structure as follows: Section II covers background information necessary to better understand the proposed algorithm and summarizes related maximum clique algorithms. Section III explains the new maximum clique algorithm in detail. The results, comparisons between the sequential and parallel GPU algorithm, and an analysis of the results are discussed in section IV. Finally, conclusions and possible future improvements are stated in Section V.

\section{Background}

\subsection{Basic Maximum Clique Algorithm}
A basic method for searching for a maximum clique in a graph $G = (V, E)$, where $V$ is a finite set of vertices and $E$ is a finite set of edges that are unordered pairs $(u, v)$ where $u,v \in V$, is a branch and bound, depth first search method. The basic implementation explained in \cite{tomita2007efficient} uses two global sets: one to store the current clique $Q$ and another to store the largest clique found so far, $Q_{max}$. A candidate set $R$ is also stored throughout the search to keep track of which vertices may be added to $Q$. When the algorithm starts, $Q$ is empty and $R$ contains all vertices in the graph, $R = V$. The search process beings by selected a vertex $p$ from $R$ and adding the vertex to $Q$, forming a new current clique, and then proceeds to calculate a new candidate set $R$ that is the intersection of the current $R$ and the vertices adjacent to $p$. This process occurs recursively until $R$ is empty, which means that $Q$ is a maximal clique and cannot grow any larger, or $\left|Q\right| + \left|R\right| <= \left|Q_{max}\right|$ which means that the $Q$ cannot grow larger than $Q_{max}$. When $Q$ is determined to be maximal, it is compared against $Q_{max}$, and if $\left|Q\right| > \left|Q_{max}\right|$, $Q_{max} = Q$. The search then backtracks in order to find other maximal cliques, removing $p$ from both $Q$ and $R$ and then selected a new $p$ from $R$ and repeating the process until the search space has been exhausted.

\subsection{Initial Ordering}
The order in which a graph is searched can play a significant role in how quickly the maximum clique is found. Searching more likely paths first often leads to finding the solution much faster. For this reason, it is common for many algorithms to order the vertices initially before the search. Some initial orderings that are said to increase performance are non-increasing degree, minimum-width ordering, and MCR ordering which are all explained and implemented in \cite{prosser2012exact}. The algorithm presented in this paper is capable of using all three of these initial orderings.

\subsection{Approximate Coloring}
The basic MCP algorithm is much too slow to find the maximum clique for large or dense graphs in a reasonable amount of time. One way to increase the performance of the algorithm is to further prune the search space in order to avoid unnecessary searching. One method of doing so is to use approximate coloring of the graph vertices, such as the ones used in the MCQ, MCR, MCS, and BBMC algorithms \cite{san2011exact,tomita2007efficient,prosser2012exact,tomita2003efficient}. The coloring process greedily assigns each vertex in the candidate set a number value (or color) where vertices with the same color are non adjacent. Since a clique consists of a set of vertices that are all adjacent, a clique can can only potentially contain one vertex from each color. If $\left|Q\right| + color(p) \leq \left|Q_{max}\right|$ the current branch of the search space can be pruned because the current clique cannot be larger than the current maximum. This forms a tighter upper bound because the number of colors used to color the graph is almost always less than the original pruning method that uses the amount of vertices left in the candidate set. In most cases, even though the coloring process takes a considerable amount of computation time, the additional pruning increases performance in the algorithms that use it. BBMCL showed promising results by reducing the coloring overhead using a less informed, relaxed coloring method \cite{san2014relaxed}. 

\subsection{Bitset Representations and BBMC}
Another alteration to the basic MCP implementation is the use of bitsets to represent which vertices are in a set. A bitset is a data structure made up of an array of bits where each bit has individual meaning based on whether it is set or not. The BBMC algorithm along with its many variations such as BBMCI, BBMCR, BBMCL, and BBMCS all use bitsets to represent whether a vertex from the graph is in a set or not \cite{san2011exact,san2014relaxed,san2013improved,san2013robust}. The use of bitsets allows the algorithms to take advantage of bit-level parallelism in hardware where a single computer instruction can operate on a number of bits the size of a word on a processor at a time (usually 32 or 64 bits in a word). Algorithms using bitsets therefore have two main advantages: the use of a bitset instead of typical storage implementations reduces the memory needed by a factor equal to the size of a word on the system, and common operations can be done in parallel using bit-wise operations.

The BBMC algorithm along with its many alterations and improvements are recognized as some of the leading sequential MCP solvers \cite{san2011exact,san2013improved,san2010fast}. The alternate versions usually make slight changes to improve the base algorithm. For example, BBMCI is an identical algorithm to BBMC that makes a number of optimizations to the bitset framework to increase performance. Another variation of BBMC, BBMCS, was implemented to account for the very large graphs and natural sparsity that occurs in robotic data association problems by reducing memory requirements \cite{san2013robust}. In the base algorithm, the adjacency and inverse adjacency matrices representing the graph are stored as an array of size $n$ of bitsets with $n$ bits each, where $n$ is the number of vertices in the graph. Each bitset in the array stores the adjacency information for a single vertex in the graph. Each bit in the bitset corresponds to a vertex in the graph; if a bit is set to 1 it indicates that the vertex is adjacent to the vertex the bitset corresponds to, and a 0 bit indicates no edge exists between the two vertices. The inverse adjacency matrix is identical to adjacency matrix, but all bits are flipped. In the implementations the current clique, candidate set, and current maximum are all represented as bitsets with one bit for each vertex in the graph. A vertex is in the set if its corresponding position is set and is not in the set otherwise.

\subsection{Parallel MCP Algorithms}
A significant way to speed up the search for the maximum clique in a graph is to take advantage of any available parallelism in hardware. The majority of parallel implementations for MCP have been written to take advantage of multiple CPU cores. The state-of-the-art parallel exact maximum clique finder in \cite{rossi2013parallel} was designed for large sparse graphs to exploit characteristics of social and informational networks and is parallelized on a shared memory system, but could also be used on a distributed memory architecture. The implementation achieved approximately linear speedup with respect to the number of processors used, while sometimes achieving super linear speedup due to increased pruning caused from workers sharing upper bounds with each other.  

A very similar parallel approach was proposed in \cite{mccreesh2013multi} that searches for the maximum clique in a graph using multiple threads. The implementation is based off of the sequential BBMC algorithm that uses bitsets and is implemented in C++ using C++11 threads and shared memory parallelism. Results for the algorithm were impressive, with  most cases achieving around linear speedup and many cases achieving substantial super linear speedup. MCP has also been attempted on a cluster of workstations using a heuristic, neural network, and master-worker load balancing approach with fairly promising results for multiple workstation scalability \cite{cruz2013parallelizing}. A MapReduce framework that partitions the graph and solves subgraphs based on the MCR branch and bound algorithm was also implemented to utilize multiple CPUs in parallel and was shown to outperform traditional MPI-based methods that suffer from poor load balancing \cite{xiang2013scalable}.

There are also algorithms that attempt to utilize the GPU to solve MCP, though there are not every many. One implementation used a neural network and CUDA to search for the maximum clique in a graph using a GPU \cite{cruz2013parallelizing}. The algorithm was limited, only being able to handle graphs with 800 vertices maximum due to memory constraints. The authors state the the algorithm was not well suited for GPUs due to lack of even work distribution and memory constraints. Their future work suggests the use of bitsets to reduce memory requirement and make use of faster memory sources such as constant memory. In another implementation, GPUs are utilized using the Thrust library and CUDA to search for the maximum clique on interval graphs  \cite{trefftz2014parallelizing}. The algorithm is significantly faster than the sequential counterpart. Very few other maximum clique solvers have been implemented utilizing GPU parallelism. This untapped parallelism could be the key to further improving the performance of current leading algorithms.

\section{Parallel GPU Implementation}
One of the issues with utilizing GPUs to search for the maximum clique in a graph is the limited amount of main memory on the processors compared to the amount of system memory a CPU can utilize, which is usually many times larger, as indicated by \cite{cruz2013parallelizing}. Bitsets have the potential to reduce the memory requirements needed to store a set by a factor equal to the size of a word on a processor, which is typically 32 bits on a GPU. An implementation of a variation of the BBMC algorithm which utilizes bitsets could therefore address one of the major limitations determined for maximum clique solvers on GPUs, and the algorithm has the benefit of being one of the leading sequential MCP algorithms. It is also encouraging to know that the authors of BBMC note that their algorithm does not take advantage of the calculation power of graphics cards despite the use of bit parallel operations \cite{san2013robust}, implying that a GPU implementation may further improve upon the algorithm. This section describes a new maximum clique algorithm based on BBMC that utilizes GPU parallelism using CUDA.

\subsection{Parallel Method}
The BBMC implementation on the GPU, which will be referred to as BBMCG, has two distinct dimensions of parallelism. The first dimension uses block-wise parallelism to divide the search space of the graph into subtrees in order to traverse the search space in parallel. This method was inspired by the work queue distribution method proposed in \cite{mccreesh2013multi}, where a work queue is initially populated by splitting the search space immediately below the root. In BBMCG, each block is initialized with a current clique $Q$ that contains one vertex from the graph, indicated by a 1 at the position for the vertex in the bitset. The corresponding initial candidate set $R$ for each block is a set containing all the adjacent vertices to the vertex in $Q$. BBMCG therefore creates a number of blocks for the algorithm that is equal to the number of vertices in the graph. Each block then traverses its own search tree recursively utilizing recursion on the GPU based on the BBMC algorithm described in  \cite{san2011exact,prosser2012exact,san2010fast}. A single global variable is shared between the blocks to indicate the current maximum clique size found. If a block finds a clique in its search space larger than the current maximum, it atomically updates the global maximum using CUDA's built in atomicMax function and stores the maximum clique's bitset in a designated space for the block in global memory. By traversing the search space in parallel and communicating the largest clique found, more of the search space is potentially pruned as larger cliques are found faster. 

BBMCG's second dimension of parallelism is thread-wise parallel computations for bit-wise operations. CUDA lacks a bitset class, so bitsets are implemented using arrays of unsigned integers. In a bitset, a single bit is needed for each vertex in the graph. The number of unsigned integers needed to represent a bitset is calculated by dividing the number of vertices $n$ in the graph by the number of bits in an unsigned integer in CUDA and rounding up. Since unsigned integers are typically 32 bits long, one unsigned integer is needed to represent 32 vertices in the graph. The adjacency matrix which requires a bitset for each vertex in the graph could then be represented as a 2-D array of unsigned integers, but CUDA generally prefers working with contiguous 1-D arrays. The adjacency and inverse adjacency matrices are stored as 1-D arrays of unsigned integer mappings of the 2-D array equivalents. 

Any bit-wise operation that is not sequential in nature is parallelized by performing the operations using multiple threads per block. The number of threads used per block is equivalent to the number of unsigned integers needed to represent a bitset for the graph. BBMCG maps a single thread to each unsigned integer of a bitset in order for the threads to perform bit-wise operations on subsections of the entire bitset in parallel. In the base BBMC algorithm, the majority of computations are bit-wise operations on bitsets such as finding the first set bit, getting the number of set bits, intersecting bitsets, copying bitsets, and setting or clearing a bit in a bitset. All of these operations besides setting and clearing a bit can be executed in parallel to increase performance. For example, to calculate the number of bits that are 1 in a bitset, each thread executes the function $\_\_popc$ on its corresponding unsigned integer from the bitset, which is a built in CUDA function that counts the number of bits set to 1 in an unsigned int. Results of each thread are then stored in array in shared memory, and the threads calculate the sum of all the set bits using an efficient parallel vector reduction algorithm on the array. Figure \ref{fig_bitset_count} shows an example of this process. Similar methods are used to parallelize other bit-wise operations. 

\begin{figure}[!t]
\centering
\includegraphics[width=3.0in]{parallel_bitwise_operation}
\caption{The process used in BBMCG to parallelize counting the number of set bits in a bitset.}
\label{fig_bitset_count}
\end{figure}

\subsection{Memory Allocation and Management}
Although the BBMCG is able to run on graph sizes larger than those in \cite{cruz2013parallelizing}, due to the use of bitsets, some memory limitations were still encountered. One of the primary memory consumers in BBMCG is the storage space needed to store the run-time stack for recursion. Each recursive call in the search needs memory to store local variables, arrays to store the coloring of vertices to improve pruning, and several bitsets to store the current clique and candidate set. The maximum number of recursive calls directly corresponds to the size of the maximum clique in the graph, so the algorithm needs to allocate enough memory to reach the maximum depth to find the maximum clique in worst case scenarios. The amount of memory needed is then multiplied by the number of blocks used, which all make their own recursive calls as they search. Faster forms of memory, such as local registers and shared memory (limited to 48 KB on modern cards) are not large enough to store data at each level of recursion for all blocks. Even constant memory memory is limited to 64 KB and is too small to store the adjacency matrices as suggested in \cite{cruz2013parallelizing}. A single adjacency matrix for a graph size of 700 is approximately the maximum size that can be stored in constant memory, which would severely limit the problems BBMCG can run on.

BBMCG makes heavy use of global memory to address the memory limitations of shared and constant memory, but still uses registers to store non-array based data. Although global memory is much slower than shared and constant memory, it is necessary in order to run the algorithm on reasonable sizes. Global memory needed for the for the entire problem is pre-allocated before launching the kernel. The memory is allocated as several 1-D arrays that each store a single type of data for all of the blocks in the Kernel. Each block calculates the starting position in the 1-D array for its portion of memory and points to it using a pointer. Each block's section of memory in the array is then broken down further into subsections for levels of recursion where each subsection is big enough to store the associated data for a single level. When a block makes a recursive call, a new pointer is created that is offset the size of one subsection of data from the current pointer position, pointing to a new subsection of space reserved to store data for the new level of recursion. The 1-D array is created to be large enough so that each block has enough memory to store data for each level of recursion all the way down to the maximum calculated level. Figure \ref{fig_memory_allocation} shows an example of the memory layout. One disadvantage is that the amount of memory allocated may be much larger than what is actually used, because memory is allocated for a number of recursive calls equal to the number of vertices in the graph for each block. The smaller the maximum clique is in the graph, the less reserved memory is actually used. CUDA supports dynamic memory allocation which would only use the amount of memory that is needed by the algorithm, but it is extremely slow and not worth the storage benefits.

\begin{figure}[!t]
\centering
\includegraphics[width=3.0in]{memory_allocation}
\caption{A visualization of the large 1-D array used to store all instances of current cliques needed by the algorithm at different levels of recursion.}
\label{fig_memory_allocation}
\end{figure}

\subsection{Preprocessing and Post-processing}
Several preprocessing steps need to happen before the BBMCG kernel is launched on the GPU. First the vertices of the graph need to be initially ordered based on the desired ordering method. The initial search space subtrees for each block must be generated from the graph as a pair of bitsets holding a current clique and candidate set. On the CPU the adjacency and inverse adjacency matrices are represented using a bitset class, so they must be converted to arrays of unsigned integers to be able to be passed to the GPU. Before launching the kernel, all of the global memory must be allocated for various types of data such as current cliques, candidate sets, vertex color arrays, the adjacency matrices, local maximum cliques, and more. After memory is allocated and data is correctly represented, data for the adjacency matrices and block subtrees have to copied to global memory on the GPU. Lastly, since the algorithm supports the use of multiple GPUs, threads must be created to launch a BBMCG kernel on each GPU. The multiple GPU variation of BBMCG is primarily used to increase the amount of available memory and will be explained in the discussion section. In the post-processing stage, each thread copies the local maximums of the blocks to CPU memory and finds the largest clique among the blocks. 

\section{Results and Discussion}
\subsection{Sequential Implementation and Test Environment}
The sequential version of BBMC used in the results and the preprocessing portion of BBMCG were both written in C++11. The sequential implementation is a port of the BBMC algorithm provided in \cite{prosser2012exact}, which provides explanations of many different sequential maximum clique solvers and their implementations in Java. It is important to note that our C++ version of BBMC is slightly slower than the Java implementation it was based on. The C++11 version is nearly identical to the Java implementation, but uses vectors from the standard template library in place of the ArrayList class in Java and Boost's dynamic\_bitset class in place of Java's BitSet class. Differences in these class implementations may be the cause of the performance difference, but further investigation is needed. 

The computer used to gather both sequential and parallel GPU results is composed of a Intel(R) Core(TM) i7 CPU 4790K @ 4.00 GHz, two NVIDIA GTX 970's with 1664 CUDA cores running at 1050 MHz and 4 GB of memory each, and 16 GB of DDR3 system memory. Randomly generated graphs and a subset of the DIMACS benchmark graphs \cite{johnson1996cliques} were used to compare the sequential BBMC and parallel BBMCG algorithms.

\subsection{Results}
The randomly generated graphs are used to show trends for the sequential and parallel algorithms as graph size and edge density change. Figure \ref{fig_edge_probability} compares the running times on randomly generate graphs with 200 vertices and edge probabilities $p$. In the graph each pair of vertices has a probability of $p$ to be an edge in the graph. The results show that at a size of 200 vertices, the sequential algorithm outperforms BBMCG in all cases, but the two have similar growth rates with respect to increasing edge probability. It can also be seen the preprocessing stage for BBMCG takes approximately 0.1 seconds, and is slightly higher when running on multiple GPUs than on a single one. It is the case that preprocessing always take approximately that much time, and therefore is negligible in nontrivial problems. A final observation that can be made is that BBMCG runs nearly identical on multiple GPUs as it does on a single GPU for nontrivial problems. 

\begin{figure}[!t]
\centering
\includegraphics[width=3.5in]{edge_probability}
\caption{Growth rate of BBMC and BBMCG run times with respect to graph density on randomly generated graphs with 200 vertices.}
\label{fig_edge_probability}
\end{figure}

Figure \ref{fig_graph_size} compares the running times of BBMC and BBMCG on various graph sizes. It can be seen that the growth rate for the BBMCG algorithm with respect to graph size is much lower than the growth rate of the sequential algorithm. The parallel algorithm outperforms the sequential on all graph sizes above 400 vertices. Once again, the multiple GPU implementation performs almost identical to the single GPU implementation in almost all cases. 

\begin{figure}[!t]
\centering
\includegraphics[width=3.5in]{graph_size}
\caption{Growth rate of BBMC and BBMCG run times with respect to graph size on randomly generated graphs with 0.5 edge probability.}
\label{fig_graph_size}
\end{figure}

Table \ref{dimacs_results} reports the required times for the BBMC and BBMCG algorithms to find the maximum clique in a subset of the DIMACS benchmark graphs. The timings reported do not include run times for BBMCG on multiple GPUs, as they are identical to the run times for BBMCG on a single GPU for all nontrivial graphs. Results show that on 20 of the 24 selected graphs, BBMCG outperforms the sequential algorithm. The four graphs where speedup does not occur all have less than 400 vertices, which is consistent with the results shown in Figure \ref{fig_edge_probability} and Figure \ref{fig_graph_size}. Typical speedup is between the 1.4 to 4.0 range, with a few more extreme outliers. The results presented use minimum-width ordering for the initial ordering, which tends to perform best on the majority of graphs for BBMC and BBMCG. Experiments found that when using non-increasing degree or MCR methods for initial ordering the results were comparable. 

\begin{table}[h]
\caption{Run Times in Seconds on DIMACS Graphs using Minimum-Width Ordering}
\label{dimacs_results}
\begin{tabular}{@{}lrlrrrr@{}}
\toprule
\textbf{Name}               & \textbf{n} & \textbf{p} & \textbf{$\omega$} & \textbf{Sequential} & \textbf{1 GPU} & \textbf{Speedup} \\ \midrule
brock200\_1                 & 200        & 0.745      & 21         & 0.61                & 0.84           & 0.7              \\
\textit{brock400\_4}        & 400        & 0.75       & 33         & 66.87               & 30.11          & \textbf{2.2}     \\
\textit{MANN\_a27}          & 378        & 0.99       & 126        & 0.58                & 6.29           & 0.1              \\
\textit{p\_hat1000-1}       & 1000       & 0.245      & 10         & 0.52                & 0.36           & \textbf{1.4}     \\
\textit{p\_hat1500-1}       & 1,500      & 0.253      & 12         & 5.40                & 1.58           & \textbf{3.4}     \\
\textit{p\_hat300-3}        & 300        & 0.744      & 36         & 1.81                & 1.01           & \textbf{1.8}     \\
\textit{p\_hat500-2}        & 500        & 0.505      & 36         & 0.44                & 0.30           & \textbf{1.5}     \\
\textit{p\_hat500-3}        & 500        & 0.752      & 50         & 116.78              & 69.02          & \textbf{1.7}     \\
\textit{p\_hat700-2}        & 700        & 0.498      & 44         & 5.18                & 1.76           & \textbf{3.0}     \\
\textit{san1000}            & 1000       & 0.502      & 15         & 2.21                & 0.31           & \textbf{7.1}     \\
\textit{san200\_0.9\_2}     & 200        & 0.9        & 60         & 0.21                & 0.09           & \textbf{2.4}     \\
\textit{san200\_0.9\_3}     & 200        & 0.9        & 44         & 0.04                & 0.19           & 0.2              \\
\textit{san400\_0.7\_1.clq} & 400        & 0.7        & 40         & 2.87                & 0.13           & \textbf{22.1}    \\
\textit{san400\_0.7\_2}     & 400        & 0.7        & 30         & 4.10                & 0.15           & \textbf{27.3}    \\
\textit{san400\_0.7\_3}     & 400        & 0.7        & 22         & 3.47                & 0.24           & \textbf{14.6}    \\
\textit{sanr200\_0.9}       & 200        & 0.898      & 42         & 43.78               & 59.69          & 0.7              \\
\textit{sanr400\_0.5}       & 400        & 0.501      & 13         & 0.53                & 0.30           & \textbf{1.8}     \\
\textit{sanr400\_0.7}       & 400        & 0.7        & 21         & 137.64              & 96.88          & \textbf{1.4}     \\
\textit{san400\_0.9\_1}     & 400        & 0.9        & 100        & 3600.00             & 0.16           & \textbf{23076.9} \\
\textit{p\_hat1000-2}       & 1000       & 0.49       & 46         & 243.41              & 70.26          & \textbf{3.5}     \\
\textit{p\_hat500-3}        & 500        & 0.752      & 50         & 114.91              & 69.04          & \textbf{1.7}     \\
\textit{brock400\_3}        & 400        & 0.75       & 27         & 302.15              & 64.89          & \textbf{4.7}     \\
\textit{brock400\_2}        & 400        & 0.75       & 29         & 523.02              & 225.48         & \textbf{2.3}     \\
\textit{brock400\_1}        & 400        & 0.75       & 27         & 420.20              & 309.51         & \textbf{1.4}     \\
\textit{brock800\_4}        & 800        & 0.65       & 26         & 3600.00             & 975.77         & \textbf{3.7}     \\ \bottomrule
\end{tabular}
\end{table}

\subsection{Analysis}
The results were consistently positive on graphs of 400 vertices or more. This seems to be the threshold where the BBMCG algorithm can start taking advantage of the parallel processing power of the GPU. As the size of the graph increases, both block-wise and thread-wise parallel components increase in parallelism. The number of blocks used in BBMCG is equivalent to the number of vertices in the graph. Therefore, larger graphs have more ways to divide up the search space and consequently can optimally use more blocks to search different subtrees. The number of subtrees used to split the search space into was experimented with, and the ideal number of blocks to use almost always turned out to be the number of vertices in the graph, not more or less. A performance gain is much more likely on larger graphs, as many different portions of the graph are all searched simultaneously with the possibility of containing the maximum clique. Searching the graph subtrees simultaneously means that large cliques can be found from subtrees that would not be searched sequentially until later in the search, further pruning the graph and removing unnecessary searching. This can sometimes lead to extreme speedup, such as in the case of san400\_0.9\_1 where BBMCG obtained 23076.9 speedup over BBMC.   

Thread-wise parallelism is a much more consistent method of increasing performance. Block-wise parallelism relies on a non-deterministic search method in an attempt to randomly guess good search paths and find large cliques faster in the graph, and will in most cases, but the strategy depends heavily on the characteristics of the graph being searched. Thread-wise parallelism speeds up many forms of bit-wise operations in the algorithm and does so regardless of the characteristics of the graph. As the size of the graph increases, the number of unsigned integers needed in a bitset to represent the graph increases. Since the number of threads in a block is equivalent to the number of unsigned integers in a bitset, it means that more threads can be used as the graph size increases. In order for a block to make use of 32 threads, which is the number of threads in a warp, the graph must have at least 993 vertices. Since the GPU runs threads in groups the size of warps, most of the graphs cause BBMCG to run idling threads. Only 4 of the 24 DIMACS graphs make use of a full warp. As a result, the majority of graphs tested are not running optimally on the GPU. Another negative aspcect of thread-wise parallelism is the fact that not all operations can be parallelized, such as setting a bit, causing all but 1 thread to idle at these occurrences. 

One severe limitation of the BBMCG algorithm is the lack of load balancing. When a block finishes searching its subtree, it runs out of work and terminates. It is very unlikely that each block is given an equal amount of work when the kernel is initialized, so it is very possible the most blocks only work for a fraction of the run time while a few blocks search the most difficult subtrees. A consequence of this is that the amount of parallelism reduces as the algorithm runs and performance is decreased.

The multiple GPU implementation of BBMCG suffers from the same load balancing issue and is the reason that it performs identically to a single GPU implementation. In the multiple GPU implementation, the initial subtrees for the blocks are divided among the multiple GPUs. Unified memory is used to shared the current maximum clique size between the GPUs and each GPU updates the maximum size as they find it. Unified memory has to be communicated between GPUs, causing a slight slow down. Although the additional GPUs do provide additional computational resources, it is unlikely that they have an even work load on initialization of the kernel. The benefits of finding and sharing large cliques faster across GPUs is limited by the lack of load balancing and slow communication, yielding almost no performance benefits. The block that has to search the most difficult subtree has to do it alone regardless of the addition of more GPUs. Multiple GPUs do provide additional memory however, and therefore do have some use to BBMCG. A maximum graph size of around 1500 vertices can run on a single GPU, but a maximum of around 2500 vertices can run on two GPUs. 

BBMCG is also slowed down by its reliance on global memory to run. Global memory is many times slower than other available forms of memory, such as shared memory. Although shared memory is used when necessary in bitwise operations and in the coloring process, it is used seldom elsewhere. Another memory related issue is the maximum stack size CUDA allows. Even at the maximum size, it was determined that the maximum level of recursion that could be reached before memory was exhausted was approximately 500 levels. This limits BBMCG to only being able to search graphs that have a maximum clique size of about 500 maximum until it run out of memory.

% An example of a floating figure using the graphicx package.
% Note that \label must occur AFTER (or within) \caption.
% For figures, \caption should occur after the \includegraphics.
% Note that IEEEtran v1.7 and later has special internal code that
% is designed to preserve the operation of \label within \caption
% even when the captionsoff option is in effect. However, because
% of issues like this, it may be the safest practice to put all your
% \label just after \caption rather than within \caption{}.
%
% Reminder: the "draftcls" or "draftclsnofoot", not "draft", class
% option should be used if it is desired that the figures are to be
% displayed while in draft mode.
%
%\begin{figure}[!t]
%\centering
%\includegraphics[width=2.5in]{myfigure}
% where an .eps filename suffix will be assumed under latex, 
% and a .pdf suffix will be assumed for pdflatex; or what has been declared
% via \DeclareGraphicsExtensions.
%\caption{Simulation Results}
%\label{fig_sim}
%\end{figure}

% Note that IEEE typically puts floats only at the top, even when this
% results in a large percentage of a column being occupied by floats.


% An example of a double column floating figure using two subfigures.
% (The subfig.sty package must be loaded for this to work.)
% The subfigure \label commands are set within each subfloat command, the
% \label for the overall figure must come after \caption.
% \hfil must be used as a separator to get equal spacing.
% The subfigure.sty package works much the same way, except \subfigure is
% used instead of \subfloat.
%
%\begin{figure*}[!t]
%\centerline{\subfloat[Case I]\includegraphics[width=2.5in]{subfigcase1}%
%\label{fig_first_case}}
%\hfil
%\subfloat[Case II]{\includegraphics[width=2.5in]{subfigcase2}%
%\label{fig_second_case}}}
%\caption{Simulation results}
%\label{fig_sim}
%\end{figure*}
%
% Note that often IEEE papers with subfigures do not employ subfigure
% captions (using the optional argument to \subfloat), but instead will
% reference/describe all of them (a), (b), etc., within the main caption.


% An example of a floating table. Note that, for IEEE style tables, the 
% \caption command should come BEFORE the table. Table text will default to
% \footnotesize as IEEE normally uses this smaller font for tables.
% The \label must come after \caption as always.
%
%\begin{table}[!t]
%% increase table row spacing, adjust to taste
%\renewcommand{\arraystretch}{1.3}
% if using array.sty, it might be a good idea to tweak the value of
% \extrarowheight as needed to properly center the text within the cells
%\caption{An Example of a Table}
%\label{table_example}
%\centering
%% Some packages, such as MDW tools, offer better commands for making tables
%% than the plain LaTeX2e tabular which is used here.
%\begin{tabular}{|c||c|}
%\hline
%One & Two\\
%\hline
%Three & Four\\
%\hline
%\end{tabular}
%\end{table}


% Note that IEEE does not put floats in the very first column - or typically
% anywhere on the first page for that matter. Also, in-text middle ("here")
% positioning is not used. Most IEEE journals/conferences use top floats
% exclusively. Note that, LaTeX2e, unlike IEEE journals/conferences, places
% footnotes above bottom floats. This can be corrected via the \fnbelowfloat
% command of the stfloats package.


\section{Conclusions and Future Work}
In this paper, a maximum clique algorithm for GPUs, BBMCG, was presented based off of one of the leading sequential algorithms, BBMC. The algorithm addresses memory limitations for maximum clique solvers on GPUs by using bitsets to reduce memory requirements. Two dimensions of parallelism are utilized on the GPU to divide up and search the graph's search space in parallel and to improve computational speed by parallelizing bit-wise operations. The result achieved is moderate speedup on all but small graph sizes and occasionally significant speedup on some problems. 

It is promising that speedup is consistently achieved regardless of the reliance on slow global memory, a lack of a load balancing system, and small graph sizes not utilizing all of the threads in a warp. Future work will attempt to address the issues to improve performance further. Preliminary work on a load balancing system suggests that it could be a source for a large boost in performance and would have the benefit of enabling multiple GPU systems to provide further speedup.  Another possibly significant alteration would be to integrate a managed memory pool system in global memory with the goal to eliminate the inefficient allocation of excess memory to blocks caused by not knowing how much memory is needed to solve the problem. Other reportedly beneficial additions to maximum clique solvers such as recoloring, the use of a heuristic clique, and sparse bitset implementations will also be considered as possible improvements.  

\section*{Acknowledgements}

This material is based in part upon work supported by: The National Science Foundation under grant number(s) IIA-1329469, and by Cubix Corporation through use of their PCIe slot expansion hardware solutions and HostEngine. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation or Cubix Corporation 

% conference papers do not normally have an appendix


% trigger a \newpage just before the given reference
% number - used to balance the columns on the last page
% adjust value as needed - may need to be readjusted if
% the document is modified later
%\IEEEtriggeratref{8}
% The "triggered" command can be changed if desired:
%\IEEEtriggercmd{\enlargethispage{-5in}}

% references section

% can use a bibliography generated by BibTeX as a .bbl file
% BibTeX documentation can be easily obtained at:
% http://www.ctan.org/tex-archive/biblio/bibtex/contrib/doc/
% The IEEEtran BibTeX style support page is at:
% http://www.michaelshell.org/tex/ieeetran/bibtex/
%\bibliographystyle{IEEEtran}
% argument is your BibTeX string definitions and bibliography database(s)
%\bibliography{IEEEabrv,../bib/paper}
%
% <OR> manually copy in the resultant .bbl file
% set second argument of \begin to the number of references
% (used to reserve space for the reference number labels box)
\bibliographystyle{./IEEEtran}
\bibliography{./IEEEexample}


% that's all folks
\end{document}