Rebase: Merge BOLT codebase in monorepo

Summary:
This commit is the first step in rebasing all of the BOLT
history into the LLVM monorepo. It also fixes trivial build issues
by updating the BOLT codebase to use current LLVM. There is still work
left in rebasing some BOLT features and in making sure everything
is working as intended.

History has been rewritten to put BOLT in the /bolt folder, as
opposed to /tools/llvm-bolt.

(cherry picked from FBD33289252)
Amir Ayupov 2020-12-01 16:29:39 -08:00 committed by Maksim Panchenko
parent 0a8aaf56bb
commit 1c5d3a056c
113 changed files with 4016 additions and 9118 deletions


@ -1,10 +1,245 @@
==============================================================================
LLVM Release License
The LLVM Project is under the Apache License v2.0 with LLVM Exceptions:
==============================================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---- LLVM Exceptions to the Apache 2.0 License ----
As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.
==============================================================================
Software from third parties included in the LLVM Project:
==============================================================================
The LLVM Project contains third party software which is under different license
terms. All such code will be identified clearly using at least one of two
mechanisms:
1) It will be in a separate directory tree with its own `LICENSE.txt` or
`LICENSE` file at the top containing the specific license and restrictions
which apply to that software, or
2) It will contain specific license and restriction terms at the top of every
file.
==============================================================================
Legacy LLVM License (https://llvm.org/docs/DeveloperPolicy.html#legacy):
==============================================================================
University of Illinois/NCSA
Open Source License
Copyright (c) 2003-2017 University of Illinois at Urbana-Champaign.
Copyright (c) 2003-2019 University of Illinois at Urbana-Champaign.
All rights reserved.
Developed by:
@ -42,22 +277,3 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
==============================================================================
Copyrights and Licenses for Third Party Software Distributed with LLVM:
==============================================================================
The LLVM software contains code written by third parties. Such software will
have its own individual LICENSE.TXT file in the directory in which it appears.
This file will describe the copyrights, license, and restrictions which apply
to that code.
The disclaimer of warranty in the University of Illinois Open Source License
applies to all code in the LLVM Distribution, and nothing in any of the
other licenses gives permission to use the names of the LLVM Team or the
University of Illinois to endorse or promote products derived from this
Software.
The following pieces of software have additional or alternate copyrights,
licenses, and/or restrictions:
Program Directory
------- ---------


@ -1,25 +0,0 @@
;===- ./tools/llvm-bolt/LLVMBuild.txt ---------------------------*- Conf -*--===;
;
; The LLVM Compiler Infrastructure
;
; This file is distributed under the University of Illinois Open Source
; License. See LICENSE.TXT for details.
;
;===------------------------------------------------------------------------===;
;
; This is an LLVMBuild description file for the components in this subdirectory.
;
; For more information on the LLVMBuild system, please see:
;
; http://llvm.org/docs/LLVMBuild.html
;
;===------------------------------------------------------------------------===;
[common]
subdirectories = merge-fdata
[component_0]
type = Tool
name = llvm-bolt
parent = Tools
required_libraries = MC MCDisassembler MCParser Object all-targets


@ -0,0 +1,222 @@
# BOLT
BOLT is a post-link optimizer developed to speed up large applications.
It achieves the improvements by optimizing the application's code layout based on
an execution profile gathered by a sampling profiler, such as the Linux `perf` tool.
An overview of the ideas implemented in BOLT along with a discussion of its
potential and current results is available in
[CGO'19 paper](https://research.fb.com/publications/bolt-a-practical-binary-optimizer-for-data-centers-and-beyond/).
## Input Binary Requirements
BOLT operates on X86-64 and AArch64 ELF binaries. At a minimum, the binaries
should have an unstripped symbol table, and, to get maximum performance gains,
they should be linked with relocations (the `--emit-relocs` or `-q` linker flag).
BOLT disassembles functions and reconstructs the control flow graph (CFG)
before it runs optimizations. Since this is a nontrivial task,
especially when indirect branches are present, we rely on certain heuristics
to accomplish it. These heuristics have been tested on code generated with
the Clang and GCC compilers. The main requirement for C/C++ code is that it must
not rely on code layout properties, such as function pointer deltas.
Assembly code can be processed too. Requirements for it include a clear
separation of code and data, with data objects being placed into data
sections/segments. If indirect jumps are used for intra-function control
transfer (e.g., jump tables), the code patterns should match those
generated by Clang/GCC.
NOTE: BOLT is currently incompatible with the `-freorder-blocks-and-partition`
compiler option. Since GCC8 enables this option by default, you have to
disable it explicitly by adding the `-fno-reorder-blocks-and-partition` flag if
you are compiling with GCC8.
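For example, when compiling with GCC8 (the file names below are just placeholders):
```
$ g++ -O2 -fno-reorder-blocks-and-partition -c example.cpp -o example.o
```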
PIE and .so support has been added recently. Please report bugs if you
encounter any issues.
## Installation
### Docker Image
You can build and use the Docker image containing BOLT using our [Dockerfile](./utils/docker/Dockerfile).
Alternatively, you can build BOLT manually using the steps below.
### Manual Build
BOLT heavily uses LLVM libraries, and by design, it is built as one of the LLVM
tools. The build process is not much different from a regular LLVM build.
The following instructions assume that you are running under Linux.
Start with cloning LLVM and BOLT repos:
```
> git clone https://github.com/llvm-mirror/llvm llvm
> cd llvm/tools
> git checkout -b llvm-bolt f137ed238db11440f03083b1c88b7ffc0f4af65e
> git clone https://github.com/facebookincubator/BOLT llvm-bolt
> cd ..
> patch -p 1 < tools/llvm-bolt/llvm.patch
```
Proceed to a normal LLVM build using a compiler with C++11 support (for GCC
use version 4.9 or later):
```
> cd ..
> mkdir build
> cd build
> cmake -G Ninja ../llvm -DLLVM_TARGETS_TO_BUILD="X86;AArch64" -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON
> ninja
```
`llvm-bolt` will be available under `bin/`. Add this directory to your path to
ensure the rest of the commands in this tutorial work.
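For example, assuming you are still in the `build` directory:
```
$ export PATH=$PWD/bin:$PATH
```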
Note that we use a specific revision of LLVM as we currently rely on a set of
patches that are not yet upstreamed.
## Optimizing BOLT's Performance
BOLT runs many internal passes in parallel. If you foresee heavy usage of
BOLT, you can improve the processing time by linking against one of the memory
allocation libraries with good support for concurrency. E.g., to use jemalloc:
```
> sudo yum install jemalloc-devel
> LD_PRELOAD=/usr/lib64/libjemalloc.so llvm-bolt ....
```
Or, if you would rather use tcmalloc:
```
> sudo yum install gperftools-devel
> LD_PRELOAD=/usr/lib64/libtcmalloc_minimal.so llvm-bolt ....
```
## Usage
For a complete practical guide of using BOLT see [Optimizing Clang with BOLT](./docs/OptimizingClang.md).
### Step 0
In order to allow BOLT to re-arrange functions (in addition to re-arranging
code within functions) in your program, it needs a little help from the linker.
Add `--emit-relocs` to the final link step of your application. You can verify
the presence of relocations by checking for a `.rela.text` section in the binary.
BOLT will also report if it detects relocations while processing the binary.
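One possible check, assuming binutils is available:
```
$ readelf -S <executable> | grep '\.rela\.text'
```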
### Step 1: Collect Profile
This step is different for different kinds of executables. If you can invoke
your program to run on a representative input from a command line, then check
the **For Applications** section below. If your program typically runs as a
server/service, then skip to the **For Services** section.
The version of the `perf` command used for the following steps has to support
the `-F brstack` option. We recommend using `perf` version 4.5 or later.
#### For Applications
This assumes you can run your program from a command line with a typical input.
In this case, simply prepend the command line invocation with `perf`:
```
$ perf record -e cycles:u -j any,u -o perf.data -- <executable> <args> ...
```
#### For Services
Once you get the service deployed and warmed up, it is time to collect perf
data with LBR (branch information). The exact perf command to use will depend
on the service. E.g., to collect the data for all processes running on the
server for the next 3 minutes use:
```
$ perf record -e cycles:u -j any,u -a -o perf.data -- sleep 180
```
Depending on the application, you may need more samples to be included with
your profile. It's hard to tell upfront what would be a sweet spot for your
application. We recommend that the profile cover 1B instructions, as reported
by BOLT's `-dyno-stats` option. If you need to increase the number of samples
in the profile, you can either run the `sleep` command for longer or use
the `-F<N>` option with `perf` to increase the sampling frequency.
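For example, to sample for ten minutes at an increased frequency (the `-F` value below is only an illustration):
```
$ perf record -e cycles:u -j any,u -F 5000 -a -o perf.data -- sleep 600
```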
Note that for profile collection we recommend using cycle events and not
`BR_INST_RETIRED.*`. Empirically, we have found cycle events to produce better results.
If collecting a profile with branch data is not possible, e.g., when you run on
a VM or on hardware that does not support it, then you can fall back to plain
sample events, such as cycles. In this case, the quality of the profile information
will not be as good, and the performance gains with BOLT are expected to be lower.
#### With instrumentation (experimental)
If `perf record` is not available to you, you may collect a profile by first
instrumenting the binary with BOLT and then running it.
```
llvm-bolt <executable> -instrument -o <instrumented-executable>
```
After you run the instrumented executable with the desired workload, its BOLT
profile should be ready for you in `/tmp/prof.fdata`, and you can skip
**Step 2**.
Run BOLT with the `-help` option and check the category "BOLT instrumentation
options" for a quick reference on instrumentation knobs. Instrumentation is
experimental and currently does not work for PIEs/SOs.
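A minimal end-to-end sketch of this flow (the binary names are hypothetical):
```
$ llvm-bolt my-app -instrument -o my-app.instrumented
$ ./my-app.instrumented <args>          # run the desired workload
$ llvm-bolt my-app -o my-app.bolt -data=/tmp/prof.fdata -reorder-blocks=cache+
```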
### Step 2: Convert Profile to BOLT Format
NOTE: you can skip this step and feed `perf.data` directly to BOLT using the
experimental `-p perf.data` option.
For this step, you will need the `perf.data` file collected in the previous step and
a copy of the binary that was running. The binary has to be either
unstripped or have its symbol table intact (i.e., running `strip -g` is
okay).
Make sure `perf` is in your `PATH`, and execute `perf2bolt`:
```
$ perf2bolt -p perf.data -o perf.fdata <executable>
```
This command will aggregate branch data from `perf.data` and store it in a
format that is both more compact and more resilient to binary modifications.
If the profile was collected without LBRs, you will need to add the `-nl` flag to
the command line above.
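For example, for a profile collected with plain sampling (no LBR):
```
$ perf2bolt -p perf.data -o perf.fdata -nl <executable>
```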
### Step 3: Optimize with BOLT
Once you have `perf.fdata` ready, you can use it for optimizations with
BOLT. Assuming your environment is set up to include the right path, execute
`llvm-bolt`:
```
$ llvm-bolt <executable> -o <executable>.bolt -data=perf.fdata -reorder-blocks=cache+ -reorder-functions=hfsort -split-functions=2 -split-all-cold -split-eh -dyno-stats
```
If you need updated debug info, add the `-update-debug-sections` option
to the command above. The processing time will be slightly longer.
For a full list of options, see the `-help`/`-help-hidden` output.
The input binary for this step does not have to be a 100% match of the binary used for
profile collection in **Step 1**. This could happen when you are doing active
development, and the source code constantly changes, yet you want to benefit
from profile-guided optimizations. However, since the binary is not precisely the
same, the profile information could become invalid or stale, and BOLT will
report the number of functions with a stale profile. The higher the
number, the smaller the expected performance improvement. Thus, it is
crucial to update `.fdata` for release branches.
## Multiple Profiles
Suppose your application can run in different modes, and you can generate a
profile for each of them. To generate a single binary that can
benefit all modes (assuming the profiles don't contradict each other), you can
use the `merge-fdata` tool:
```
$ merge-fdata *.fdata > combined.fdata
```
Use `combined.fdata` for **Step 3** above to generate a universally optimized
binary.
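For example, reusing the optimization flags from **Step 3**:
```
$ llvm-bolt <executable> -o <executable>.bolt -data=combined.fdata -reorder-blocks=cache+ -reorder-functions=hfsort -split-functions=2 -split-all-cold -split-eh -dyno-stats
```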
## License
BOLT is licensed under [University of Illinois/NCSA Open Source License](./LICENSE.TXT).


@ -1,37 +0,0 @@
BOLT
====
BOLT is a post-link optimizer developed to speed up large applications.
It achieves speed-ups by optimizing application's code layout based on an
execution profile gathered by sampling profilers such as Linux `perf` tool.
BOLT could operate on any binary with symbol table, but for maximum gains
it utilizes relocations saved by a linker (--emit-relocs).
NOTE: Currently BOLT support is limited to non-PIC/PIE binaries.
INSTALLATION
============
BOLT heavily uses LLVM libraries and by design it is built as one of LLVM
tools. The build process is not much different from regular LLVM.
Start with cloning LLVM and BOLT repos:
> git clone https://github.com/llvm-mirror/llvm llvm
> cd llvm/tools
> git checkout -b llvm-bolt f137ed238db11440f03083b1c88b7ffc0f4af65e
> git clone https://github.com/facebookincubator/BOLT llvm-bolt
> patch -p 1 < llvm-bolt/llvm.patch
Proceed to a normal LLVM build:
> cd ../..
> mkdir build
> cd build
> cmake -G Ninja
> ninja
llvm-bolt will be available under bin/ .
Note that we use a specific revision of LLVM as we currently rely on a set of
patches that are not yet upstreamed.

bolt/docs/Heatmap.png (new binary file, 30 KiB)
bolt/docs/Heatmaps.md Normal file

@ -0,0 +1,50 @@
# Code Heatmaps
BOLT has gained the ability to print code heatmaps based on
sampling-based LBR profiles generated by `perf`. The output is produced
in colored ASCII to be displayed in a color-capable terminal. It looks
something like this:
![](./Heatmap.png)
Heatmaps can be generated for BOLTed and non-BOLTed binaries. You can
use them to compare the code layout before and after optimizations.
To generate a heatmap, start by running your app under `perf`:
```bash
$ perf record -e cycles:u -j any,u -- <executable with args>
```
or if you want to monitor the existing process(es):
```bash
$ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep <interval>
```
Note that at the moment running with LBR (`-j any,u` or `-b`) is
a requirement.
Once the run is complete and `perf.data` is generated, run BOLT in
heatmap mode:
```bash
$ llvm-bolt heatmap -p perf.data <executable>
```
By default, the heatmap will be dumped to *stdout*. You can change this
with the `-o <heatmapfile>` option. Each character/block in the heatmap
shows the execution data accumulated for the corresponding 64 bytes of
code. You can change this granularity with the `-block-size` option.
E.g., set it to 4096 to see code usage grouped by 4K pages.
Other useful options are:
```bash
-line-size=<uint> - number of entries per line (default 256)
-max-address=<uint> - maximum address considered valid for heatmap (default 4GB)
```
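For example, to write a page-granularity heatmap to a file, combining the options above (the `-line-size` value is just an illustration):
```bash
$ llvm-bolt heatmap -p perf.data -block-size=4096 -line-size=128 -o heatmap.txt <executable>
```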
If you prefer to look at the data in a browser (or would like to share
it that way), then you can use an HTML conversion tool. E.g.:
```bash
$ aha -b -f <heatmapfile> > <heatmapfile>.html
```


@ -0,0 +1,266 @@
# Optimizing Clang : A Practical Example of Applying BOLT
## Preface
*BOLT* (Binary Optimization and Layout Tool) is designed to improve application
performance by laying out code in a manner that helps the CPU better utilize its caching and
branch prediction resources.
The most obvious candidates for BOLT optimizations
are programs that suffer from many instruction cache and iTLB misses, such as
large applications measuring in the hundreds of megabytes. However, medium-sized
programs can benefit too. Clang, one of the most popular open-source C/C++ compilers,
is a good example of the latter. Its code size can easily be on the order of tens of megabytes.
As we will see, the Clang binary suffers from many instruction cache
misses and can be significantly improved with BOLT, even on top of profile-guided and
link-time optimizations.
In this tutorial we will first build Clang with PGO and LTO, and then show how to
apply BOLT optimizations to make Clang up to 15% faster. We will also analyze where
the compile-time performance gains are coming from, and verify that the speed-ups are
sustainable while building other applications.
## Building Clang
The process of getting Clang sources and performing the build is very similar to the
one described at http://clang.llvm.org/get_started.html. For completeness, we provide the detailed steps
on how to obtain and build Clang in the [Bootstrapping Clang-7 with PGO and LTO](#bootstrapping-clang-7-with-pgo-and-lto) section.
The only difference from the standard Clang build is that we require the `-Wl,-q` flag to be present during
the final link. This option saves relocation metadata in the executable file, but does not affect
the generated code in any way.
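After the final link, one way to confirm that the relocation metadata was kept is to look for relocation sections in the binary (a quick, non-authoritative check):
```bash
$ readelf -S <clang-binary> | grep '\.rela\.text'
```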
## Optimizing Clang with BOLT
We will use the setup described in [Bootstrapping Clang-7 with PGO and LTO](#bootstrapping-clang-7-with-pgo-and-lto).
Adjust the steps accordingly if you skipped that section. We will also assume that `llvm-bolt` is present in your `$PATH`.
Before we can run BOLT optimizations, we need to collect the profile for Clang, and we will use
Clang/LLVM sources for that.
Collecting an accurate profile requires running `perf` on hardware that
implements taken branch sampling (the `-b`/`-j` flag). For that reason, it may not be possible to
collect an accurate profile in a virtualized environment, e.g., in the cloud.
We do support regular sampling profiles, but the performance
improvements are expected to be more modest.
```bash
$ mkdir ${TOPLEV}/stage3
$ cd ${TOPLEV}/stage3
$ CPATH=${TOPLEV}/stage2-prof-use-lto/install/bin/
$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
-DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3/install
$ perf record -e cycles:u -j any,u -- ninja clang
```
Once the last command finishes, it will have created a `perf.data` file larger than 10 GiB.
We will first convert this profile into a more compact aggregated
form suitable to be consumed by BOLT:
```bash
$ perf2bolt $CPATH/clang-7 -p perf.data -o clang-7.fdata -w clang-7.yaml
```
Notice that we are passing `clang-7` to `perf2bolt`, which is the real binary that
`clang` and `clang++` are symlinked to. The next step will optimize Clang using
the generated profile:
```bash
$ llvm-bolt $CPATH/clang-7 -o $CPATH/clang-7.bolt -b clang-7.yaml \
-reorder-blocks=cache+ -reorder-functions=hfsort+ -split-functions=3 \
-split-all-cold -dyno-stats -icf=1 -use-gnu-stack
```
The output will look similar to the one below:
```text
...
BOLT-INFO: enabling relocation mode
BOLT-INFO: 11415 functions out of 104526 simple functions (10.9%) have non-empty execution profile.
...
BOLT-INFO: ICF folded 29144 out of 105177 functions in 8 passes. 82 functions had jump tables.
BOLT-INFO: Removing all identical functions will save 5466.69 KB of code space. Folded functions were called 2131985 times based on profile.
BOLT-INFO: basic block reordering modified layout of 7848 (10.32%) functions
...
660155947 : executed forward branches (-2.3%)
48252553 : taken forward branches (-57.2%)
129897961 : executed backward branches (+13.8%)
52389551 : taken backward branches (-19.5%)
35650038 : executed unconditional branches (-33.2%)
128338874 : all function calls (=)
19010563 : indirect calls (=)
9918250 : PLT calls (=)
6113398840 : executed instructions (-0.6%)
1519537463 : executed load instructions (=)
943321306 : executed store instructions (=)
20467109 : taken jump table branches (=)
825703946 : total branches (-2.1%)
136292142 : taken branches (-41.1%)
689411804 : non-taken conditional branches (+12.6%)
100642104 : taken conditional branches (-43.4%)
790053908 : all conditional branches (=)
...
```
The statistics in the output are based on the LBR profile collected with `perf`, and since we were using
the `cycles` counter, its accuracy is affected. However, the relative improvement in `taken conditional
branches` is a good indication that BOLT was able to straighten out the code even after PGO.
## Measuring Compile-time Improvement
`clang-7.bolt` can be used as a replacement for *PGO+LTO* Clang:
```bash
$ mv $CPATH/clang-7 $CPATH/clang-7.org
$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
```
Doing a new build of Clang using the new binary shows a significant overall
build time reduction on a 48-core Haswell system:
```bash
$ ln -fs $CPATH/clang-7.org $CPATH/clang-7
$ ninja clean && /bin/time -f %e ninja clang -j48
202.72
$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
$ ninja clean && /bin/time -f %e ninja clang -j48
180.11
```
That's 22.61 seconds (or 12%) faster compared to the *PGO+LTO* build.
Notice that we are measuring an improvement in the total build time, which includes the time spent in the linker.
Compilation time improvements for individual files differ, and speedups over 15% are not uncommon.
If we run BOLT on a Clang binary compiled without *PGO+LTO* (in which case the build finishes in 253.32 seconds),
the gains we see are over 50 seconds (25%),
but, as expected, the result is still slower than the *PGO+LTO+BOLT* build.
## Source of the Wins
We mentioned that Clang suffers from a considerable number of instruction cache misses. This can be measured with `perf`:
```bash
$ ln -fs $CPATH/clang-7.org $CPATH/clang-7
$ ninja clean && perf stat -e instructions,L1-icache-misses -- ninja clang -j48
...
16,366,101,626,647 instructions
359,996,216,537 L1-icache-misses
```
That's about 22 instruction cache misses per thousand instructions. As a rule of thumb, if the application
has over 10 misses per thousand instructions, it is a good indication that it will be improved by BOLT.
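For reference, this rate follows directly from the two counters above:
```bash
$ echo "scale=1; 359996216537 * 1000 / 16366101626647" | bc
21.9
```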
Now let's see how many misses are in the BOLTed binary:
```bash
$ ln -fs $CPATH/clang-7.bolt $CPATH/clang-7
$ ninja clean && perf stat -e instructions,L1-icache-misses -- ninja clang -j48
...
16,319,818,488,769 instructions
244,888,677,972 L1-icache-misses
```
The number of misses per thousand instructions went down from 22 to 15, significantly reducing
the number of stalls in the CPU front-end.
Notice how the number of executed instructions stayed roughly the same. That's because we didn't
run any optimizations beyond the ones affecting the code layout. Besides instruction cache misses,
BOLT also reduces branch mispredictions, iTLB misses, and misses in the L2 and L3 caches.
## Using Clang for Other Applications
We have collected a profile for Clang using its own source code. Would it be enough to speed up
the compilation of other projects? We picked `mysqld`, an open-source database server, to test this.
On our 48-core Haswell system, the build finished in 136.06 seconds with the *PGO+LTO* Clang, and in 126.10 seconds with the *PGO+LTO+BOLT* Clang.
That's a noticeable improvement, but not as significant as the one we saw on Clang itself.
This is partially because the rate of instruction cache misses is slightly lower in this scenario: 19 vs. 22 per thousand instructions.
Another reason is that Clang is run with a different set of options while building `mysqld` compared
to the training run.
Different options exercise different code paths, and
if we trained without a specific option, we may have misplaced parts of the code responsible for handling it.
To test this theory, we have collected another `perf` profile while building `mysqld`, and merged it with an existing profile
using the `merge-fdata` utility that comes with BOLT. Optimized with that profile, the *PGO+LTO+BOLT* Clang was able
to perform the `mysqld` build in 124.74 seconds, i.e., 11 seconds or 9% faster compared to the *PGO+LTO* Clang.
The merged profile didn't make the original Clang compilation slower either, while the number of profiled functions in Clang increased from 11,415 to 14,025.
Ideally, the profile run should be done with a superset of all commonly used options. However, the main improvement is expected with just the basic set.
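A sketch of the merge step described earlier (the `.fdata` file names are hypothetical):
```bash
$ merge-fdata clang-7.fdata clang-7-mysqld.fdata > clang-7-combined.fdata
```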
## Summary
In this tutorial we demonstrated how to use BOLT to improve the
performance of the Clang compiler. Similarly, BOLT could be used to improve the performance
of GCC, or any other application suffering from a high number of instruction
cache misses.
----
# Appendix
## Bootstrapping Clang-7 with PGO and LTO
Below we describe the detailed steps to build Clang and make it ready for BOLT optimizations. If you
already have the build set up, you can skip this section, except for the last step, which adds the `-Wl,-q` linker flag to the final link.
### Getting Clang-7 Sources
Set `$TOPLEV` to the directory of your preference where you would like to do
the builds, e.g., `TOPLEV=~/clang-7/`. Then clone the `release_70` branches
of LLVM, Clang, the lld linker, and the compiler runtime:
```bash
$ cd ${TOPLEV}
$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/llvm.git/ llvm
$ cd llvm/tools
$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/clang.git/
$ cd ../projects
$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/lld.git/
$ git clone -q --depth=1 --branch=release_70 https://git.llvm.org/git/compiler-rt.git/
```
### Building Stage 1 Compiler
Stage 1 will be the first build we are going to do, and we will be using the
default system compiler to build Clang. If your system lacks a compiler, use your distribution's package manager to install one
that supports C++11. In this example we are going to use GCC. In addition to the compiler,
you will need the `cmake` and `ninja` packages.
```bash
$ mkdir ${TOPLEV}/stage1
$ cd ${TOPLEV}/stage1
$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_ASM_COMPILER=gcc \
-DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage1/install
$ ninja install
```
### Building Stage 2 Compiler With Instrumentation
Using the freshly-baked stage 1 Clang compiler, we are going to build Clang with profile generation capabilities:
```bash
$ mkdir ${TOPLEV}/stage2-prof-gen
$ cd ${TOPLEV}/stage2-prof-gen
$ CPATH=${TOPLEV}/stage1/install/bin/
$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
-DLLVM_USE_LINKER=lld -DLLVM_BUILD_INSTRUMENTED=ON \
-DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage2-prof-gen/install
$ ninja install
```
### Generating Profile for PGO
While there are many ways to obtain profile data, we are going to use the source code already at our
disposal, i.e., we are going to collect the profile while building Clang itself:
```bash
$ mkdir ${TOPLEV}/stage3-train
$ cd ${TOPLEV}/stage3-train
$ CPATH=${TOPLEV}/stage2-prof-gen/install/bin
$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
-DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage3-train/install
$ ninja clang
```
Once the build completes, the profile files will be saved under `${TOPLEV}/stage2-prof-gen/profiles`. We need to merge them before they can be passed back into Clang:
```bash
$ cd ${TOPLEV}/stage2-prof-gen/profiles
$ ${TOPLEV}/stage1/install/bin/llvm-profdata merge -output=clang.profdata *
```
### Building Clang with PGO and LTO
Now the profile can be used to guide optimizations to produce better code for our scenario, i.e., building Clang.
We will also enable link-time optimizations to allow cross-module inlining and other optimizations. Finally, we are going to add one extra step that is useful for BOLT: a linker flag instructing the linker to preserve relocations in the output binary. Note that this flag does not affect the generated code or the data used at runtime; it only writes metadata to the file on disk:
```bash
$ mkdir ${TOPLEV}/stage2-prof-use-lto
$ cd ${TOPLEV}/stage2-prof-use-lto
$ CPATH=${TOPLEV}/stage1/install/bin/
$ export LDFLAGS="-Wl,-q"
$ cmake -G Ninja ${TOPLEV}/llvm -DLLVM_TARGETS_TO_BUILD=X86 -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=$CPATH/clang -DCMAKE_CXX_COMPILER=$CPATH/clang++ \
-DLLVM_ENABLE_LTO=Full -DLLVM_PROFDATA_FILE=${TOPLEV}/stage2-prof-gen/profiles/clang.profdata \
-DLLVM_USE_LINKER=lld -DCMAKE_INSTALL_PREFIX=${TOPLEV}/stage2-prof-use-lto/install
$ ninja install
```
Now we have a Clang compiler that can build itself much faster. As we will see, it builds other applications faster as well, and, with BOLT, the compile time can be improved even further.


@ -0,0 +1,37 @@
# BOLT ORC-based linker
A high-level view of the simple linker used to insert auxiliary/library code into the final binary produced by BOLT. This is built on top of LLVM's ORC infrastructure (the newest iteration of JIT support in LLVM).
## Several levels of code injection
When BOLT starts processing an input executable, its first task is to raise the binary to a low-level IR with a CFG. After this is done, we are ready to change code in this binary. Throughout BOLT's pipeline of code transformations, there are plenty of situations where we need to insert new code or fix existing code.
For small code changes inside a basic block, we typically defer this work to MCPlusBuilder. This is our target-independent interface for creating new instructions, but it also contains some functions that may create code spanning multiple basic blocks (for instance, when doing indirect call promotion and unrolling an indirect call into a ladder of comparisons/direct calls). The implementation here usually boils down to programmatically creating new MCInst instructions while setting their opcodes according to the target list (see X86GenInstOpcodes.inc, generated by tablegen in an LLVM build).
However, this approach quickly becomes awkward if we want to insert a lot of code, especially if this code is frozen and never changes. In these situations, it is more convenient to have a runtime library with all the code you need to insert. This library defines some symbols and can be linked into the final binary. In this case, all you need to do in a BOLT transformation is to insert a call to your library.
## The runtime library
Currently, our runtime library is written in C++ and contains code that helps us instrument a binary.
### Limitations
Our library is not written as regular C++ code, since it is not linked against any other libraries (this means we cannot rely on anything defined in libstdc++, glibc, libgcc, etc.) but must be self-sufficient. In runtime/CMakeLists.txt, we can see it is built with `-ffreestanding`, which requires the compiler to avoid relying on a runtime library itself.
While this requires us to make our own syscalls, it greatly simplifies our linker, which is very limited and can only do basic function name resolution. Even so, this is a big improvement compared with programmatically generating the code in assembly language using MCInsts.
A few more quirks:
* No BSS section: don't use uninitialized globals
* No dependencies on foreign code: self-sufficient
* You should closely watch the generated bolt_rt object files; anything requiring fancy linker features will break. We only support bare-bones `.text` and `.data`, and nothing else.
Read the opening comment in instr.cpp for more details.
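A hypothetical compiler invocation illustrating such a freestanding build (only `-ffreestanding` is taken from runtime/CMakeLists.txt; the remaining flags are assumptions):
```bash
$ clang++ -c -O2 -ffreestanding -fno-exceptions -fno-rtti instr.cpp -o instr.o
```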
## Linking
While RewriteInstance::emitAndLink() will perform an initial link step to resolve all references in the input program, it will not start linking the runtime library right away. The input program lives in its own module, which may end up with unresolved references to the runtime library.
RewriteInstance::linkRuntime() has the job of actually reading individual .o files and adding them to the binary. We currently have a single .o file, so after it is read, ORC can finally resolve references from the first module to the newly inserted .o objects.
This sequence of steps is done by calls to addObject() and emitAndFinalize(). The latter will trigger symbol resolution, relying on the symbol resolver provided by us when calling createLegacyLookupResolver().

File diff suppressed because it is too large.


@ -181,12 +181,12 @@ BinaryContext::createBinaryContext(ObjectFile *File, bool IsPIC,
return nullptr;
}
auto TheTriple = llvm::make_unique<Triple>(File->makeTriple());
auto TheTriple = std::make_unique<Triple>(File->makeTriple());
const std::string TripleName = TheTriple->str();
std::string Error;
const Target *TheTarget =
TargetRegistry::lookupTarget(ArchName, *TheTriple, Error);
TargetRegistry::lookupTarget(std::string(ArchName), *TheTriple, Error);
if (!TheTarget) {
errs() << "BOLT-ERROR: " << Error;
return nullptr;
@ -201,7 +201,7 @@ BinaryContext::createBinaryContext(ObjectFile *File, bool IsPIC,
// Set up disassembler.
std::unique_ptr<const MCAsmInfo> AsmInfo(
TheTarget->createMCAsmInfo(*MRI, TripleName));
TheTarget->createMCAsmInfo(*MRI, TripleName, MCTargetOptions()));
if (!AsmInfo) {
errs() << "BOLT-ERROR: no assembly info for target " << TripleName << "\n";
return nullptr;
@ -221,11 +221,25 @@ BinaryContext::createBinaryContext(ObjectFile *File, bool IsPIC,
return nullptr;
}
std::unique_ptr<MCObjectFileInfo> MOFI =
llvm::make_unique<MCObjectFileInfo>();
std::unique_ptr<MCContext> Ctx =
llvm::make_unique<MCContext>(AsmInfo.get(), MRI.get(), MOFI.get());
MOFI->InitMCObjectFileInfo(*TheTriple, IsPIC, *Ctx);
std::unique_ptr<MCContext> Ctx(
new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get()));
std::unique_ptr<MCObjectFileInfo> MOFI(
TheTarget->createMCObjectFileInfo(*Ctx, IsPIC));
Ctx->setObjectFileInfo(MOFI.get());
// We do not support X86 Large code model. Change this in the future.
bool Large = false;
if (TheTriple->getArch() == llvm::Triple::aarch64)
Large = true;
unsigned LSDAEncoding =
Large ? dwarf::DW_EH_PE_absptr : dwarf::DW_EH_PE_udata4;
unsigned TTypeEncoding =
Large ? dwarf::DW_EH_PE_absptr : dwarf::DW_EH_PE_udata4;
if (IsPIC) {
LSDAEncoding = dwarf::DW_EH_PE_pcrel |
(Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4);
TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
(Large ? dwarf::DW_EH_PE_sdata8 : dwarf::DW_EH_PE_sdata4);
}
std::unique_ptr<MCDisassembler> DisAsm(
TheTarget->createMCDisassembler(*STI, *Ctx));
@ -270,11 +284,15 @@ BinaryContext::createBinaryContext(ObjectFile *File, bool IsPIC,
errs().SetUnbuffered();
dbgs().SetUnbuffered();
auto BC = llvm::make_unique<BinaryContext>(
auto BC = std::make_unique<BinaryContext>(
std::move(Ctx), std::move(DwCtx), std::move(TheTriple), TheTarget,
TripleName, std::move(MCE), std::move(MOFI), std::move(AsmInfo),
std::move(MII), std::move(STI), std::move(InstructionPrinter),
std::move(MIA), std::move(MIB), std::move(MRI), std::move(DisAsm));
std::string(TripleName), std::move(MCE), std::move(MOFI),
std::move(AsmInfo), std::move(MII), std::move(STI),
std::move(InstructionPrinter), std::move(MIA), std::move(MIB),
std::move(MRI), std::move(DisAsm));
BC->TTypeEncoding = TTypeEncoding;
BC->LSDAEncoding = LSDAEncoding;
BC->MAB = std::unique_ptr<MCAsmBackend>(
BC->TheTarget->createMCAsmBackend(*BC->STI, *BC->MRI, MCTargetOptions()));
@ -476,7 +494,7 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF,
// TODO: use DWARF info to get size/alignment here?
auto *TargetSymbol = getOrCreateGlobalSymbol(Address, "DATAat");
DEBUG(dbgs() << "Created symbol " << TargetSymbol->getName() << '\n');
LLVM_DEBUG(dbgs() << "Created symbol " << TargetSymbol->getName() << '\n');
return std::make_pair(TargetSymbol, Addend);
}
@ -594,24 +612,25 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
UpperBound = std::min(NextJTAddress, UpperBound);
}
DEBUG(dbgs() << "BOLT-DEBUG: analyzeJumpTable in " << BF.getPrintName()
<< '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: analyzeJumpTable in " << BF.getPrintName()
<< '\n');
const auto EntrySize = getJumpTableEntrySize(Type);
for (auto EntryAddress = Address; EntryAddress <= UpperBound - EntrySize;
EntryAddress += EntrySize) {
DEBUG(dbgs() << " * Checking 0x" << Twine::utohexstr(EntryAddress)
<< " -> ");
LLVM_DEBUG(dbgs() << " * Checking 0x" << Twine::utohexstr(EntryAddress)
<< " -> ");
// Check if there's a proper relocation against the jump table entry.
if (HasRelocations) {
if (Type == JumpTable::JTT_PIC &&
!DataPCRelocations.count(EntryAddress)) {
DEBUG(
LLVM_DEBUG(
dbgs() << "FAIL: JTT_PIC table, no relocation for this address\n");
break;
}
if (Type == JumpTable::JTT_NORMAL && !getRelocationAt(EntryAddress)) {
DEBUG(dbgs()
<< "FAIL: JTT_NORMAL table, no relocation for this address\n");
LLVM_DEBUG(
dbgs()
<< "FAIL: JTT_NORMAL table, no relocation for this address\n");
break;
}
}
@ -624,7 +643,7 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
if (Value == BF.getAddress() + BF.getSize()) {
addOffset(Value - BF.getAddress());
HasUnreachable = true;
DEBUG(dbgs() << "OK: __builtin_unreachable\n");
LLVM_DEBUG(dbgs() << "OK: __builtin_unreachable\n");
continue;
}
@ -633,7 +652,7 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
// We assume that a jump table cannot have function start as an entry.
if (!doesBelongToFunction(Value, TargetBF) || Value == BF.getAddress()) {
DEBUG({
LLVM_DEBUG({
if (!BF.containsAddress(Value)) {
dbgs() << "FAIL: function doesn't contain this address\n";
if (TargetBF) {
@ -656,7 +675,7 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
// Check there's an instruction at this offset.
if (TargetBF->getState() == BinaryFunction::State::Disassembled &&
!TargetBF->getInstructionAtOffset(Value - TargetBF->getAddress())) {
DEBUG(dbgs() << "FAIL: no instruction at this offset\n");
LLVM_DEBUG(dbgs() << "FAIL: no instruction at this offset\n");
break;
}
@ -665,13 +684,13 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
if (TargetBF == &BF) {
// Address inside the function.
addOffset(Value - TargetBF->getAddress());
DEBUG(dbgs() << "OK: real entry\n");
LLVM_DEBUG(dbgs() << "OK: real entry\n");
} else {
// Address in split fragment.
BF.setHasSplitJumpTable(true);
// Add invalid offset for proper identification of jump table size.
addOffset(INVALID_OFFSET);
DEBUG(dbgs() << "OK: address in split fragment\n");
LLVM_DEBUG(dbgs() << "OK: address in split fragment\n");
}
}
@ -683,7 +702,8 @@ bool BinaryContext::analyzeJumpTable(const uint64_t Address,
void BinaryContext::populateJumpTables() {
std::vector<BinaryFunction *> FuncsToSkip;
DEBUG(dbgs() << "DataPCRelocations: " << DataPCRelocations.size() << '\n');
LLVM_DEBUG(dbgs() << "DataPCRelocations: " << DataPCRelocations.size()
<< '\n');
for (auto JTI = JumpTables.begin(), JTE = JumpTables.end(); JTI != JTE;
++JTI) {
auto *JT = JTI->second;
@ -738,7 +758,7 @@ void BinaryContext::populateJumpTables() {
}
if (opts::StrictMode && DataPCRelocations.size()) {
DEBUG({
LLVM_DEBUG({
dbgs() << DataPCRelocations.size()
<< " unclaimed PC-relative relocations left in data:\n";
for (auto Reloc : DataPCRelocations)
@ -752,7 +772,8 @@ void BinaryContext::populateJumpTables() {
for (auto BF : FuncsToSkip) {
BinaryFunction *ParentBF =
const_cast<BinaryFunction *>(BF->getTopmostFragment());
DEBUG(dbgs() << "Skipping " << ParentBF->getPrintName() << " family\n");
LLVM_DEBUG(dbgs() << "Skipping " << ParentBF->getPrintName()
<< " family\n");
ParentBF->setIgnored();
ParentBF->ignoreFragments();
}
@ -812,9 +833,8 @@ BinaryContext::getOrCreateJumpTable(BinaryFunction &Function, uint64_t Address,
JTLabel = registerNameAtAddress(JumpTableName, Address, 0, EntrySize);
}
DEBUG(dbgs() << "BOLT-DEBUG: creating jump table "
<< JTLabel->getName()
<< " in function " << Function << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: creating jump table " << JTLabel->getName()
<< " in function " << Function << '\n');
auto *JT = new JumpTable(*JTLabel,
Address,
@ -845,7 +865,7 @@ BinaryContext::duplicateJumpTable(BinaryFunction &Function, JumpTable *JT,
break;
}
assert(Found && "Label not found");
auto *NewLabel = Ctx->createTempSymbol("duplicatedJT", true);
auto *NewLabel = Ctx->createNamedTempSymbol("duplicatedJT");
auto *NewJT = new JumpTable(*NewLabel,
JT->getAddress(),
JT->EntrySize,
@ -872,7 +892,7 @@ std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF,
Offset = Address - JT->getAddress();
auto Itr = JT->Labels.find(Offset);
if (Itr != JT->Labels.end()) {
return Itr->second->getName();
return std::string(Itr->second->getName());
}
Id = JumpTableIds.at(JT->getAddress());
} else {
@ -908,7 +928,6 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) {
InstrSize,
FunctionData->slice(Offset),
InstrAddress,
nulls(),
nulls()))
break;
if (!Predicate(Instr))
@ -1393,18 +1412,17 @@ unsigned BinaryContext::addDebugFilenameToUnit(const uint32_t DestCUID,
"FileIndex out of range for the compilation unit.");
StringRef Dir = "";
if (FileNames[FileIndex - 1].DirIdx != 0) {
if (auto DirName =
if (auto DirName = dwarf::toString(
LineTable->Prologue
.IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1]
.getAsCString()) {
.IncludeDirectories[FileNames[FileIndex - 1].DirIdx - 1])) {
Dir = *DirName;
}
}
StringRef FileName = "";
if (auto FName = FileNames[FileIndex - 1].Name.getAsCString())
if (auto FName = dwarf::toString(FileNames[FileIndex - 1].Name))
FileName = *FName;
assert(FileName != "");
return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, DestCUID));
return cantFail(Ctx->getDwarfFile(Dir, FileName, 0, None, None, DestCUID));
}
std::vector<BinaryFunction *> BinaryContext::getSortedFunctions() {
@ -1453,8 +1471,9 @@ void BinaryContext::preprocessDebugInfo() {
// Building a map of address ranges to CUs similar to .debug_aranges and use
// it to assign CU to functions.
std::vector<CURange> AllRanges;
AllRanges.reserve(DwCtx->getNumCompileUnits());
for (const auto &CU : DwCtx->compile_units()) {
for (auto &Range : CU->getUnitDIE().getAddressRanges()) {
for (auto &Range : cantFail(CU->getUnitDIE().getAddressRanges())) {
// Parts of the debug info could be invalidated due to corresponding code
// being removed from the binary by the linker. Hence we check if the
// address is a valid one.
@ -1476,14 +1495,20 @@ void BinaryContext::preprocessDebugInfo() {
}
// Populate MCContext with DWARF files from all units.
StringRef GlobalPrefix = AsmInfo->getPrivateGlobalPrefix();
for (const auto &CU : DwCtx->compile_units()) {
const uint32_t CUID = CU->getOffset();
const uint64_t CUID = CU->getOffset();
const DWARFDebugLine::LineTable *LineTable =
DwCtx->getLineTableForUnit(CU.get());
const auto &FileNames = LineTable->Prologue.FileNames;
// Assign a unique label to every line table, one per CU.
Ctx->getMCDwarfLineTable(CUID).setLabel(
Ctx->getOrCreateSymbol(GlobalPrefix + "line_table_start" + Twine(CUID)));
// Make sure empty debug line tables are registered too.
if (FileNames.empty()) {
cantFail(Ctx->getDwarfFile("", "<unknown>", 0, nullptr, None, CUID));
cantFail(Ctx->getDwarfFile("", "<unknown>", 0, None, None, CUID));
continue;
}
for (size_t I = 0, Size = FileNames.size(); I != Size; ++I) {
@ -1491,15 +1516,15 @@ void BinaryContext::preprocessDebugInfo() {
// means empty dir.
StringRef Dir = "";
if (FileNames[I].DirIdx != 0)
if (auto DirName =
LineTable->Prologue.IncludeDirectories[FileNames[I].DirIdx - 1]
.getAsCString())
if (auto DirName = dwarf::toString(
LineTable->Prologue
.IncludeDirectories[FileNames[I].DirIdx - 1]))
Dir = *DirName;
StringRef FileName = "";
if (auto FName = FileNames[I].Name.getAsCString())
if (auto FName = dwarf::toString(FileNames[I].Name))
FileName = *FName;
assert(FileName != "");
cantFail(Ctx->getDwarfFile(Dir, FileName, 0, nullptr, None, CUID));
cantFail(Ctx->getDwarfFile(Dir, FileName, 0, None, None, CUID));
}
}
}
@ -1591,7 +1616,7 @@ void BinaryContext::printInstruction(raw_ostream &OS,
OS << "\n";
return;
}
InstPrinter->printInst(&Instruction, OS, "", *STI);
InstPrinter->printInst(&Instruction, 0, "", *STI, OS);
if (MIB->isCall(Instruction)) {
if (MIB->isTailCall(Instruction))
OS << " # TAILCALL ";
@ -1628,7 +1653,7 @@ void BinaryContext::printInstruction(raw_ostream &OS,
const auto &Row = LineTable->Rows[RowRef.RowIndex - 1];
StringRef FileName = "";
if (auto FName =
LineTable->Prologue.FileNames[Row.File - 1].Name.getAsCString())
dwarf::toString(LineTable->Prologue.FileNames[Row.File - 1].Name))
FileName = *FName;
OS << " # debug line " << FileName << ":" << Row.Line;
@ -1673,16 +1698,15 @@ BinaryContext::getSectionNameForAddress(uint64_t Address) const {
}
BinarySection &BinaryContext::registerSection(BinarySection *Section) {
assert(!Section->getName().empty() &&
"can't register sections without a name");
auto Res = Sections.insert(Section);
assert(Res.second && "can't register the same section twice.");
// Only register allocatable sections in the AddressToSection map.
if (Section->isAllocatable() && Section->getAddress())
AddressToSection.insert(std::make_pair(Section->getAddress(), Section));
NameToSection.insert(std::make_pair(Section->getName(), Section));
DEBUG(dbgs() << "BOLT-DEBUG: registering " << *Section << "\n");
NameToSection.insert(
std::make_pair(std::string(Section->getName()), Section));
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: registering " << *Section << "\n");
return *Section;
}
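The DEBUG → LLVM_DEBUG renames throughout this diff track the upstream LLVM macro rename; LLVM_DEBUG also requires a DEBUG_TYPE definition in the translation unit. A minimal sketch (assuming "bolt" as the debug type):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "bolt"

// Compiled out in release builds; in assertion-enabled builds the message
// prints only under -debug or -debug-only=bolt.
void noteRegistration(llvm::StringRef Name) {
  LLVM_DEBUG(llvm::dbgs() << "BOLT-DEBUG: registering " << Name << "\n");
}
```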
@ -1710,10 +1734,10 @@ BinarySection &BinaryContext::registerOrUpdateSection(StringRef Name,
"can only update unique sections");
auto *Section = NamedSections.begin()->second;
DEBUG(dbgs() << "BOLT-DEBUG: updating " << *Section << " -> ");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: updating " << *Section << " -> ");
const auto Flag = Section->isAllocatable();
Section->update(Data, Size, Alignment, ELFType, ELFFlags);
DEBUG(dbgs() << *Section << "\n");
LLVM_DEBUG(dbgs() << *Section << "\n");
// FIXME: Fix section flags/attributes for MachO.
if (isELF())
assert(Flag == Section->isAllocatable() &&
@ -1738,7 +1762,8 @@ bool BinaryContext::deregisterSection(BinarySection &Section) {
++Range.first;
}
auto NameRange = NameToSection.equal_range(SectionPtr->getName());
auto NameRange =
NameToSection.equal_range(std::string(SectionPtr->getName()));
while (NameRange.first != NameRange.second) {
if (NameRange.first->second == SectionPtr) {
NameToSection.erase(NameRange.first);
@ -1778,7 +1803,7 @@ BinaryContext::getUnsignedValueAtAddress(uint64_t Address,
DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(),
AsmInfo->getCodePointerSize());
auto ValueOffset = static_cast<uint32_t>(Address - Section->getAddress());
auto ValueOffset = static_cast<uint64_t>(Address - Section->getAddress());
return DE.getUnsigned(&ValueOffset, Size);
}
@ -1794,7 +1819,7 @@ BinaryContext::getSignedValueAtAddress(uint64_t Address,
DataExtractor DE(Section->getContents(), AsmInfo->isLittleEndian(),
AsmInfo->getCodePointerSize());
auto ValueOffset = static_cast<uint32_t>(Address - Section->getAddress());
auto ValueOffset = static_cast<uint64_t>(Address - Section->getAddress());
return DE.getSigned(&ValueOffset, Size);
}
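The cast change in these two hunks reflects DataExtractor moving to 64-bit cursor offsets. A sketch of the read pattern under that API (8-byte address size assumed):

```cpp
#include "llvm/Support/DataExtractor.h"

// Read an unsigned little-endian value of Size bytes at Offset. The
// extractor advances Offset past the bytes it consumed.
uint64_t readUnsigned(llvm::StringRef Contents, uint64_t Offset,
                      uint32_t Size) {
  llvm::DataExtractor DE(Contents, /*IsLittleEndian=*/true,
                         /*AddressSize=*/8);
  return DE.getUnsigned(&Offset, Size);
}
```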
@ -1852,7 +1877,7 @@ void BinaryContext::markAmbiguousRelocations(BinaryData &BD,
const uint64_t Address) {
auto setImmovable = [&](BinaryData &BD) {
auto *Root = BD.getAtomicRoot();
DEBUG(if (Root->isMoveable()) {
LLVM_DEBUG(if (Root->isMoveable()) {
dbgs() << "BOLT-DEBUG: setting " << *Root << " as immovable "
<< "due to ambiguous relocation referencing 0x"
<< Twine::utohexstr(Address) << '\n';
@ -1936,14 +1961,15 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
SmallString<256> Code;
raw_svector_ostream VecOS(Code);
std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(VecOS);
std::unique_ptr<MCStreamer> Streamer(TheTarget->createMCObjectStreamer(
*TheTriple, *LocalCtx, std::unique_ptr<MCAsmBackend>(MAB), VecOS,
*TheTriple, *LocalCtx, std::unique_ptr<MCAsmBackend>(MAB), std::move(OW),
std::unique_ptr<MCCodeEmitter>(MCEInstance.MCE.release()), *STI,
/*RelaxAll=*/false,
/*IncrementalLinkerCompatible=*/false,
/*DWARFMustBeAtTheEnd=*/false));
Streamer->InitSections(false);
Streamer->initSections(false, *STI);
auto *Section = MCEInstance.LocalMOFI->getTextSection();
Section->setHasInstructions(true);
@ -1955,10 +1981,10 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
MCSymbol *ColdEndLabel = LocalCtx->createTempSymbol();
Streamer->SwitchSection(Section);
Streamer->EmitLabel(StartLabel);
Streamer->emitLabel(StartLabel);
emitFunctionBody(*Streamer, BF, /*EmitColdPart=*/false,
/*EmitCodeOnly=*/true);
Streamer->EmitLabel(EndLabel);
Streamer->emitLabel(EndLabel);
if (BF.isSplit()) {
auto *ColdSection =
@ -1968,14 +1994,18 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
ColdSection->setHasInstructions(true);
Streamer->SwitchSection(ColdSection);
Streamer->EmitLabel(ColdStartLabel);
Streamer->emitLabel(ColdStartLabel);
emitFunctionBody(*Streamer, BF, /*EmitColdPart=*/true,
/*EmitCodeOnly=*/true);
Streamer->EmitLabel(ColdEndLabel);
Streamer->emitLabel(ColdEndLabel);
// To avoid calling MCObjectStreamer::flushPendingLabels() which is private
Streamer->emitBytes(StringRef(""));
Streamer->SwitchSection(Section);
}
// To avoid calling MCObjectStreamer::flushPendingLabels() which is private.
Streamer->EmitBytes(StringRef(""));
// To avoid calling MCObjectStreamer::flushPendingLabels(), which is private,
// or MCStreamer::Finish(), which does more than we want.
Streamer->emitBytes(StringRef(""));
auto &Assembler =
static_cast<MCObjectStreamer *>(Streamer.get())->getAssembler();
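The hunk cuts off before the actual measurement; presumably the function continues by laying out the throwaway assembler and taking label-offset differences, roughly like this sketch against the MCAsmLayout API of this LLVM era:

```cpp
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCSymbol.h"

// Lay out the local assembler and measure a region as the distance between
// its start and end labels (a sketch; error handling elided).
uint64_t emittedSize(llvm::MCAssembler &Assembler, const llvm::MCSymbol &Start,
                     const llvm::MCSymbol &End) {
  llvm::MCAsmLayout Layout(Assembler);
  Assembler.layout(Layout);
  return Layout.getSymbolOffset(End) - Layout.getSymbolOffset(Start);
}
```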

View File

@ -239,7 +239,7 @@ public:
std::shared_ptr<ExecutableFileMemoryManager> EFMM;
StringRef getFilename() const { return Filename; }
void setFilename(StringRef Name) { Filename = Name; }
void setFilename(StringRef Name) { Filename = std::string(Name); }
Optional<StringRef> getFileBuildID() const {
if (FileBuildID) {
@ -248,7 +248,7 @@ public:
return NoneType();
}
void setFileBuildID(StringRef ID) { FileBuildID = ID; }
void setFileBuildID(StringRef ID) { FileBuildID = std::string(ID); }
bool hasSymbolsWithFileName() const {
return HasSymbolsWithFileName;
@ -583,6 +583,11 @@ public:
/// special Linux kernel sections
std::unordered_map<uint64_t, std::vector<LKInstructionMarkerInfo>> LKMarkers;
/// DWARF encoding. Available encoding types are defined in
/// BinaryFormat/Dwarf.h enum Constants, e.g. DW_EH_PE_omit.
unsigned TTypeEncoding = dwarf::DW_EH_PE_omit;
unsigned LSDAEncoding = dwarf::DW_EH_PE_omit;
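For reference, a DW_EH_PE encoding byte packs a value format in its low nibble and an application modifier in bits 4-6, with bit 7 flagging indirection; a small decoder sketch (DW_EH_PE_omit, 0xff, is a reserved marker with no such structure):

```cpp
#include "llvm/BinaryFormat/Dwarf.h"
#include <cstdint>

using namespace llvm;

// Split a DW_EH_PE encoding byte into its components.
struct EHEncoding {
  uint8_t Format;      // e.g. dwarf::DW_EH_PE_udata4 (low 4 bits)
  uint8_t Application; // e.g. dwarf::DW_EH_PE_pcrel (bits 4-6)
  bool Indirect;       // dwarf::DW_EH_PE_indirect (bit 7)
};

EHEncoding decodeEHEncoding(uint8_t Enc) {
  return {static_cast<uint8_t>(Enc & 0x0f), static_cast<uint8_t>(Enc & 0x70),
          (Enc & dwarf::DW_EH_PE_indirect) != 0};
}
```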
BinaryContext(std::unique_ptr<MCContext> Ctx,
std::unique_ptr<DWARFContext> DwCtx,
std::unique_ptr<Triple> TheTriple,
@ -974,11 +979,11 @@ public:
/// Return section(s) associated with given \p Name.
iterator_range<NameToSectionMapType::iterator>
getSectionByName(StringRef Name) {
return make_range(NameToSection.equal_range(Name));
return make_range(NameToSection.equal_range(std::string(Name)));
}
iterator_range<NameToSectionMapType::const_iterator>
getSectionByName(StringRef Name) const {
return make_range(NameToSection.equal_range(Name));
return make_range(NameToSection.equal_range(std::string(Name)));
}
/// Return the unique section associated with given \p Name.
@ -1187,12 +1192,12 @@ public:
/// won't be used in the main code emitter.
IndependentCodeEmitter createIndependentMCCodeEmitter() const {
IndependentCodeEmitter MCEInstance;
MCEInstance.LocalMOFI = llvm::make_unique<MCObjectFileInfo>();
MCEInstance.LocalCtx = llvm::make_unique<MCContext>(
AsmInfo.get(), MRI.get(), MCEInstance.LocalMOFI.get());
MCEInstance.LocalMOFI->InitMCObjectFileInfo(*TheTriple,
/*PIC=*/!HasFixedLoadAddress,
*MCEInstance.LocalCtx);
MCEInstance.LocalCtx.reset(
new MCContext(*TheTriple, AsmInfo.get(), MRI.get(), STI.get()));
MCEInstance.LocalMOFI.reset(
TheTarget->createMCObjectFileInfo(*MCEInstance.LocalCtx.get(),
/*PIC=*/!HasFixedLoadAddress));
MCEInstance.LocalCtx->setObjectFileInfo(MCEInstance.LocalMOFI.get());
MCEInstance.MCE.reset(
TheTarget->createMCCodeEmitter(*MII, *MRI, *MCEInstance.LocalCtx));
return MCEInstance;

View File

@ -12,7 +12,9 @@
#include "BinaryContext.h"
#include "BinaryEmitter.h"
#include "BinaryFunction.h"
#include "Utils.h"
#include "llvm/MC/MCSection.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/SMLoc.h"
@ -22,8 +24,6 @@
using namespace llvm;
using namespace bolt;
extern cl::opt<uint32_t> X86AlignBranchBoundary;
namespace opts {
extern cl::OptionCategory BoltCategory;
@ -164,6 +164,8 @@ private:
void emitJumpTable(const JumpTable &JT, MCSection *HotSection,
MCSection *ColdSection);
void emitCFIInstruction(const MCCFIInstruction &Inst) const;
/// Emit exception handling ranges for the function.
void emitLSDA(BinaryFunction &BF, bool EmitColdPart);
@ -189,13 +191,13 @@ private:
} // anonymous namespace
void BinaryEmitter::emitAll(StringRef OrgSecPrefix) {
Streamer.InitSections(false);
Streamer.initSections(false, *BC.STI);
if (auto *RtLibrary = BC.getRuntimeLibrary()) {
RtLibrary->emitBinary(BC, Streamer);
}
BC.getTextSection()->setAlignment(opts::AlignText);
BC.getTextSection()->setAlignment(Align(opts::AlignText));
emitFunctions();
@ -204,21 +206,20 @@ void BinaryEmitter::emitAll(StringRef OrgSecPrefix) {
emitDataSections(OrgSecPrefix);
Streamer.EmitLabel(BC.Ctx->getOrCreateSymbol("_end"));
Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("_end"));
}
void BinaryEmitter::emitFunctions() {
auto emit = [&](const std::vector<BinaryFunction *> &Functions) {
const auto HasProfile = BC.NumProfiledFuncs > 0;
const uint32_t OriginalBranchBoundaryAlign = X86AlignBranchBoundary;
const bool OriginalAllowAutoPadding = Streamer.getAllowAutoPadding();
for (auto *Function : Functions) {
if (!BC.shouldEmit(*Function)) {
continue;
}
DEBUG(dbgs() << "BOLT: generating code for function \""
<< *Function << "\" : "
<< Function->getFunctionNumber() << '\n');
LLVM_DEBUG(dbgs() << "BOLT: generating code for function \"" << *Function
<< "\" : " << Function->getFunctionNumber() << '\n');
// Was any part of the function emitted?
bool Emitted{false};
@ -226,16 +227,16 @@ void BinaryEmitter::emitFunctions() {
// Turn off Intel JCC Erratum mitigation for cold code if requested
if (HasProfile && opts::X86AlignBranchBoundaryHotOnly &&
!Function->hasValidProfile())
X86AlignBranchBoundary = 0;
Streamer.setAllowAutoPadding(false);
Emitted |= emitFunction(*Function, /*EmitColdPart=*/false);
if (Function->isSplit()) {
if (opts::X86AlignBranchBoundaryHotOnly)
X86AlignBranchBoundary = 0;
Streamer.setAllowAutoPadding(false);
Emitted |= emitFunction(*Function, /*EmitColdPart=*/true);
}
X86AlignBranchBoundary = OriginalBranchBoundaryAlign;
Streamer.setAllowAutoPadding(OriginalAllowAutoPadding);
if (Emitted)
Function->setEmitted(/*KeepCFG=*/opts::PrintCacheMetrics);
@ -245,7 +246,7 @@ void BinaryEmitter::emitFunctions() {
// Mark the start of hot text.
if (opts::HotText) {
Streamer.SwitchSection(BC.getTextSection());
Streamer.EmitLabel(BC.getHotTextStartSymbol());
Streamer.emitLabel(BC.getHotTextStartSymbol());
}
// Emit functions in sorted order.
@ -258,7 +259,7 @@ void BinaryEmitter::emitFunctions() {
// Mark the end of hot text.
if (opts::HotText) {
Streamer.SwitchSection(BC.getTextSection());
Streamer.EmitLabel(BC.getHotTextEndSymbol());
Streamer.emitLabel(BC.getHotTextEndSymbol());
}
}
@ -277,14 +278,15 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
BC.Ctx->addGenDwarfSection(Section);
if (BC.HasRelocations) {
Streamer.EmitCodeAlignment(BinaryFunction::MinAlign);
Streamer.emitCodeAlignment(BinaryFunction::MinAlign, &*BC.STI);
auto MaxAlignBytes = EmitColdPart
? Function.getMaxColdAlignmentBytes()
: Function.getMaxAlignmentBytes();
if (MaxAlignBytes > 0)
Streamer.EmitCodeAlignment(Function.getAlignment(), MaxAlignBytes);
Streamer.emitCodeAlignment(Function.getAlignment(), &*BC.STI,
MaxAlignBytes);
} else {
Streamer.EmitCodeAlignment(Function.getAlignment());
Streamer.emitCodeAlignment(Function.getAlignment(), &*BC.STI);
}
MCContext &Context = Streamer.getContext();
@ -296,28 +298,28 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
if (!EmitColdPart) {
StartSymbol = Function.getSymbol();
for (MCSymbol *Symbol : Function.getSymbols()) {
Streamer.EmitSymbolAttribute(Symbol, MCSA_ELF_TypeFunction);
Streamer.EmitLabel(Symbol);
Streamer.emitSymbolAttribute(Symbol, MCSA_ELF_TypeFunction);
Streamer.emitLabel(Symbol);
}
} else {
StartSymbol = Function.getColdSymbol();
Streamer.EmitSymbolAttribute(StartSymbol, MCSA_ELF_TypeFunction);
Streamer.EmitLabel(StartSymbol);
Streamer.emitSymbolAttribute(StartSymbol, MCSA_ELF_TypeFunction);
Streamer.emitLabel(StartSymbol);
}
// Emit CFI start
if (Function.hasCFI()) {
Streamer.EmitCFIStartProc(/*IsSimple=*/false);
Streamer.emitCFIStartProc(/*IsSimple=*/false);
if (Function.getPersonalityFunction() != nullptr) {
Streamer.EmitCFIPersonality(Function.getPersonalityFunction(),
Streamer.emitCFIPersonality(Function.getPersonalityFunction(),
Function.getPersonalityEncoding());
}
auto *LSDASymbol = EmitColdPart ? Function.getColdLSDASymbol()
: Function.getLSDASymbol();
if (LSDASymbol) {
Streamer.EmitCFILsda(LSDASymbol, BC.MOFI->getLSDAEncoding());
Streamer.emitCFILsda(LSDASymbol, BC.LSDAEncoding);
} else {
Streamer.EmitCFILsda(0, dwarf::DW_EH_PE_omit);
Streamer.emitCFILsda(0, dwarf::DW_EH_PE_omit);
}
// Emit CFI instructions relative to the CIE
for (const auto &CFIInstr : Function.cie()) {
@ -326,7 +328,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
MAI->getInitialFrameState();
if (std::find(FrameInstrs.begin(), FrameInstrs.end(), CFIInstr) ==
FrameInstrs.end())
Streamer.EmitCFIInstruction(CFIInstr);
emitCFIInstruction(CFIInstr);
}
}
@ -337,7 +339,7 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
if (!opts::BreakFunctionNames.empty()) {
for (auto &Name : opts::BreakFunctionNames) {
if (Function.hasNameRegex(Name)) {
Streamer.EmitIntValue(0x0B0F, 2); // UD2: 0F 0B
Streamer.emitIntValue(0x0B0F, 2); // UD2: 0F 0B
break;
}
}
@ -348,22 +350,22 @@ bool BinaryEmitter::emitFunction(BinaryFunction &Function, bool EmitColdPart) {
// Emit padding if requested.
if (auto Padding = opts::padFunction(Function)) {
DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function << " with "
<< Padding << " bytes\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: padding function " << Function << " with "
<< Padding << " bytes\n");
Streamer.emitFill(Padding, MAI->getTextAlignFillValue());
}
if (opts::MarkFuncs) {
Streamer.EmitIntValue(MAI->getTrapFillValue(), 1);
Streamer.emitIntValue(BC.MIB->getTrapFillValue(), 1);
}
// Emit CFI end
if (Function.hasCFI())
Streamer.EmitCFIEndProc();
Streamer.emitCFIEndProc();
MCSymbol *EndSymbol = EmitColdPart ? Function.getFunctionColdEndLabel()
: Function.getFunctionEndLabel();
Streamer.EmitLabel(EndSymbol);
Streamer.emitLabel(EndSymbol);
if (MAI->hasDotTypeDotSizeDirective()) {
const MCExpr *SizeExpr = MCBinaryExpr::createSub(
@ -394,13 +396,13 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
if ((opts::AlignBlocks || opts::PreserveBlocksAlignment)
&& BB->getAlignment() > 1) {
Streamer.EmitCodeAlignment(BB->getAlignment(),
Streamer.emitCodeAlignment(BB->getAlignment(), &*BC.STI,
BB->getAlignmentMaxBytes());
}
Streamer.EmitLabel(BB->getLabel());
Streamer.emitLabel(BB->getLabel());
if (!EmitCodeOnly) {
if (auto *EntrySymbol = BF.getSecondaryEntryPointSymbol(*BB)) {
Streamer.EmitLabel(EntrySymbol);
Streamer.emitLabel(EntrySymbol);
}
}
@ -430,11 +432,11 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
const auto *Label = BC.MIB->getTargetSymbol(Instr);
assert(Instr.getNumOperands() >= 1 && Label &&
"bad EH_LABEL instruction");
Streamer.EmitLabel(const_cast<MCSymbol *>(Label));
Streamer.emitLabel(const_cast<MCSymbol *>(Label));
continue;
}
if (BC.MIB->isCFI(Instr)) {
Streamer.EmitCFIInstruction(*BF.getCFIFor(Instr));
emitCFIInstruction(*BF.getCFIFor(Instr));
continue;
}
@ -445,7 +447,6 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
// This assumes the second instruction in the macro-op pair will get
// assigned to its own MCRelaxableFragment. Since all JCC instructions
// are relaxable, we should be safe.
Streamer.EmitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64);
}
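The mitigation referenced in the comment keeps a macro-fused compare-and-branch pair from crossing or ending on a 32-byte boundary. An illustrative predicate for that condition (not BOLT's code, which delegates this to the X86 assembler backend):

```cpp
#include <cstdint>

// True if an instruction of Size bytes starting at Start crosses or ends
// on a 32-byte boundary, the condition the JCC erratum padding avoids.
bool hitsJCCErratum(uint64_t Start, uint64_t Size) {
  const uint64_t End = Start + Size; // one past the last byte; Size > 0
  return (Start / 32) != ((End - 1) / 32) || (End % 32) == 0;
}
```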
if (!EmitCodeOnly && opts::UpdateDebugSections && BF.getDWARFUnit()) {
@ -458,12 +459,12 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, bool EmitColdPart,
if (!EmitCodeOnly && BF.requiresAddressTranslation() &&
BC.MIB->hasAnnotation(Instr, "Offset")) {
const auto Offset = BC.MIB->getAnnotationAs<uint32_t>(Instr, "Offset");
MCSymbol *LocSym = BC.Ctx->createTempSymbol(/*CanBeUnnamed=*/true);
Streamer.EmitLabel(LocSym);
MCSymbol *LocSym = BC.Ctx->createTempSymbol();
Streamer.emitLabel(LocSym);
BB->getLocSyms().emplace_back(std::make_pair(Offset, LocSym));
}
Streamer.EmitInstruction(Instr, *BC.STI);
Streamer.emitInstruction(Instr, *BC.STI);
LastIsPrefix = BC.MIB->isPrefix(Instr);
}
}
@ -480,9 +481,9 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
if (!OnBehalfOf) {
if (!EmitColdPart)
Streamer.EmitLabel(BF.getFunctionConstantIslandLabel());
Streamer.emitLabel(BF.getFunctionConstantIslandLabel());
else
Streamer.EmitLabel(BF.getFunctionColdConstantIslandLabel());
Streamer.emitLabel(BF.getFunctionColdConstantIslandLabel());
}
assert((!OnBehalfOf || Islands.Proxies[OnBehalfOf].size() > 0) &&
@ -538,7 +539,7 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
auto NextStop = std::min(NextLabelOffset, NextRelOffset);
assert(NextStop <= EndOffset && "internal overflow error");
if (FunctionOffset < NextStop) {
Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, NextStop));
Streamer.emitBytes(FunctionContents.slice(FunctionOffset, NextStop));
FunctionOffset = NextStop;
}
if (IS != Islands.Offsets.end() && FunctionOffset == IS->first) {
@ -547,49 +548,49 @@ void BinaryEmitter::emitConstantIslands(BinaryFunction &BF, bool EmitColdPart,
// symbol being emitted on behalf of an external function.
if (!OnBehalfOf) {
if (!EmitColdPart) {
DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
<< IS->second->getName() << " at offset 0x"
<< Twine::utohexstr(IS->first) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
<< IS->second->getName() << " at offset 0x"
<< Twine::utohexstr(IS->first) << '\n');
if (IS->second->isUndefined())
Streamer.EmitLabel(IS->second);
Streamer.emitLabel(IS->second);
else
assert(BF.hasName(IS->second->getName()));
assert(BF.hasName(std::string(IS->second->getName())));
} else if (Islands.ColdSymbols.count(IS->second) != 0) {
DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
<< Islands.ColdSymbols[IS->second]->getName()
<< '\n');
LLVM_DEBUG(dbgs()
<< "BOLT-DEBUG: emitted label "
<< Islands.ColdSymbols[IS->second]->getName() << '\n');
if (Islands.ColdSymbols[IS->second]->isUndefined())
Streamer.EmitLabel(Islands.ColdSymbols[IS->second]);
Streamer.emitLabel(Islands.ColdSymbols[IS->second]);
}
} else {
if (!EmitColdPart) {
if (MCSymbol *Sym = Islands.Proxies[OnBehalfOf][IS->second]) {
DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName()
<< '\n');
Streamer.EmitLabel(Sym);
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label "
<< Sym->getName() << '\n');
Streamer.emitLabel(Sym);
}
} else if (MCSymbol *Sym =
Islands.ColdProxies[OnBehalfOf][IS->second]) {
DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName()
<< '\n');
Streamer.EmitLabel(Sym);
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << Sym->getName()
<< '\n');
Streamer.emitLabel(Sym);
}
}
++IS;
}
if (RI != BF.getMoveRelocations().end() && FunctionOffset == RI->first) {
auto RelocationSize = RI->second.emit(&Streamer);
DEBUG(dbgs() << "BOLT-DEBUG: emitted relocation for symbol "
<< RI->second.Symbol->getName() << " at offset 0x"
<< Twine::utohexstr(RI->first)
<< " with size " << RelocationSize << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted relocation for symbol "
<< RI->second.Symbol->getName() << " at offset 0x"
<< Twine::utohexstr(RI->first) << " with size "
<< RelocationSize << '\n');
FunctionOffset += RelocationSize;
++RI;
}
}
assert(FunctionOffset <= EndOffset && "overflow error");
if (FunctionOffset < EndOffset) {
Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, EndOffset));
Streamer.emitBytes(FunctionContents.slice(FunctionOffset, EndOffset));
}
}
assert(IS == Islands.Offsets.end() && "some symbols were not emitted!");
@ -661,6 +662,10 @@ SMLoc BinaryEmitter::emitLineInfo(const BinaryFunction &BF, SMLoc NewLoc,
}
void BinaryEmitter::emitJumpTables(const BinaryFunction &BF) {
MCSection *ReadOnlySection = BC.MOFI->getReadOnlySection();
MCSection *ReadOnlyColdSection = BC.MOFI->getContext().getELFSection(
".rodata.cold", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
if (!BF.hasJumpTables())
return;
@ -689,11 +694,10 @@ void BinaryEmitter::emitJumpTables(const BinaryFunction &BF) {
ColdSection = HotSection;
} else {
if (BF.isSimple()) {
HotSection = BC.MOFI->getReadOnlySection();
ColdSection = BC.MOFI->getReadOnlyColdSection();
HotSection = ReadOnlySection;
ColdSection = ReadOnlyColdSection;
} else {
HotSection = BF.hasProfile() ? BC.MOFI->getReadOnlySection()
: BC.MOFI->getReadOnlyColdSection();
HotSection = BF.hasProfile() ? ReadOnlySection : ReadOnlyColdSection;
ColdSection = HotSection;
}
}
@ -723,42 +727,90 @@ void BinaryEmitter::emitJumpTable(const JumpTable &JT, MCSection *HotSection,
LabelCounts[CurrentLabel] = CurrentLabelCount;
} else {
Streamer.SwitchSection(JT.Count > 0 ? HotSection : ColdSection);
Streamer.EmitValueToAlignment(JT.EntrySize);
Streamer.emitValueToAlignment(JT.EntrySize);
}
MCSymbol *LastLabel = nullptr;
uint64_t Offset = 0;
for (auto *Entry : JT.Entries) {
auto LI = JT.Labels.find(Offset);
if (LI != JT.Labels.end()) {
DEBUG(dbgs() << "BOLT-DEBUG: emitting jump table "
<< LI->second->getName() << " (originally was at address 0x"
<< Twine::utohexstr(JT.getAddress() + Offset)
<< (Offset ? "as part of larger jump table\n" : "\n"));
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting jump table "
<< LI->second->getName()
<< " (originally was at address 0x"
<< Twine::utohexstr(JT.getAddress() + Offset)
<< (Offset ? "as part of larger jump table\n" : "\n"));
if (!LabelCounts.empty()) {
DEBUG(dbgs() << "BOLT-DEBUG: jump table count: "
<< LabelCounts[LI->second] << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump table count: "
<< LabelCounts[LI->second] << '\n');
if (LabelCounts[LI->second] > 0) {
Streamer.SwitchSection(HotSection);
} else {
Streamer.SwitchSection(ColdSection);
}
Streamer.EmitValueToAlignment(JT.EntrySize);
Streamer.emitValueToAlignment(JT.EntrySize);
}
Streamer.EmitLabel(LI->second);
Streamer.emitLabel(LI->second);
LastLabel = LI->second;
}
if (JT.Type == JumpTable::JTT_NORMAL) {
Streamer.EmitSymbolValue(Entry, JT.OutputEntrySize);
Streamer.emitSymbolValue(Entry, JT.OutputEntrySize);
} else { // JTT_PIC
auto JTExpr = MCSymbolRefExpr::create(LastLabel, Streamer.getContext());
auto E = MCSymbolRefExpr::create(Entry, Streamer.getContext());
auto Value = MCBinaryExpr::createSub(E, JTExpr, Streamer.getContext());
Streamer.EmitValue(Value, JT.EntrySize);
Streamer.emitValue(Value, JT.EntrySize);
}
Offset += JT.EntrySize;
}
}
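The JTT_PIC branch above emits each entry as Entry - TableLabel, making the table position-independent; at run time the dispatch sequence reverses the subtraction. Illustrative only, since the actual lookup code is whatever the original compiler generated:

```cpp
#include <cstdint>

// Recover a PIC jump-table target: each entry is a signed offset from the
// table's runtime base address.
uint64_t picJumpTarget(uint64_t TableBase, const int32_t *Entries,
                       unsigned Index) {
  return TableBase + static_cast<int64_t>(Entries[Index]);
}
```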
void BinaryEmitter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
switch (Inst.getOperation()) {
default:
llvm_unreachable("Unexpected instruction");
case MCCFIInstruction::OpDefCfaOffset:
Streamer.emitCFIDefCfaOffset(Inst.getOffset());
break;
case MCCFIInstruction::OpAdjustCfaOffset:
Streamer.emitCFIAdjustCfaOffset(Inst.getOffset());
break;
case MCCFIInstruction::OpDefCfa:
Streamer.emitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
break;
case MCCFIInstruction::OpDefCfaRegister:
Streamer.emitCFIDefCfaRegister(Inst.getRegister());
break;
case MCCFIInstruction::OpOffset:
Streamer.emitCFIOffset(Inst.getRegister(), Inst.getOffset());
break;
case MCCFIInstruction::OpRegister:
Streamer.emitCFIRegister(Inst.getRegister(), Inst.getRegister2());
break;
case MCCFIInstruction::OpWindowSave:
Streamer.emitCFIWindowSave();
break;
case MCCFIInstruction::OpNegateRAState:
Streamer.emitCFINegateRAState();
break;
case MCCFIInstruction::OpSameValue:
Streamer.emitCFISameValue(Inst.getRegister());
break;
case MCCFIInstruction::OpGnuArgsSize:
Streamer.emitCFIGnuArgsSize(Inst.getOffset());
break;
case MCCFIInstruction::OpEscape:
Streamer.AddComment(Inst.getComment());
Streamer.emitCFIEscape(Inst.getValues());
break;
case MCCFIInstruction::OpRestore:
Streamer.emitCFIRestore(Inst.getRegister());
break;
case MCCFIInstruction::OpUndefined:
Streamer.emitCFIUndefined(Inst.getRegister());
break;
}
}
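OpEscape entries carry raw DWARF CFI bytes, which is how expression-based rules travel through MCCFIInstruction; a hedged sketch of building one (the helper name is ours, and mutateDWARFExpressionTargetReg further down shows the matching decode):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Build an OpEscape carrying a raw DW_CFA_expression for register Reg,
// the kind of payload emitCFIEscape replays in the switch above.
MCCFIInstruction makeExpressionEscape(unsigned Reg, ArrayRef<uint8_t> Expr) {
  SmallString<16> Bytes;
  raw_svector_ostream OS(Bytes);
  OS << static_cast<char>(dwarf::DW_CFA_expression);
  encodeULEB128(Reg, OS);         // ULEB128 target register
  encodeULEB128(Expr.size(), OS); // ULEB128 expression length
  OS << StringRef(reinterpret_cast<const char *>(Expr.data()), Expr.size());
  // createEscape copies the bytes, so the local buffer may go out of scope.
  return MCCFIInstruction::createEscape(nullptr, OS.str());
}
```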
// The code is based on EHStreamer::emitExceptionTable().
void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
const auto *Sites =
@ -781,17 +833,17 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
Streamer.SwitchSection(BC.MOFI->getLSDASection());
const auto TTypeEncoding = BC.MOFI->getTTypeEncoding();
const auto TTypeEncoding = BC.TTypeEncoding;
const auto TTypeEncodingSize = BC.getDWARFEncodingSize(TTypeEncoding);
const auto TTypeAlignment = 4;
// Type tables have to be aligned at 4 bytes.
Streamer.EmitValueToAlignment(TTypeAlignment);
Streamer.emitValueToAlignment(TTypeAlignment);
// Emit the LSDA label.
auto *LSDASymbol = EmitColdPart ? BF.getColdLSDASymbol() : BF.getLSDASymbol();
assert(LSDASymbol && "no LSDA symbol set");
Streamer.EmitLabel(LSDASymbol);
Streamer.emitLabel(LSDASymbol);
// Corresponding FDE start.
const MCSymbol *StartSymbol = EmitColdPart ? BF.getColdSymbol()
@ -818,27 +870,27 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
// code for shared objects.
std::function<void(const MCSymbol *)> emitLandingPad;
if (BC.HasFixedLoadAddress) {
Streamer.EmitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
Streamer.EmitIntValue(0, 4); // LPStart
Streamer.emitIntValue(dwarf::DW_EH_PE_udata4, 1); // LPStart format
Streamer.emitIntValue(0, 4); // LPStart
emitLandingPad = [&](const MCSymbol *LPSymbol) {
if (!LPSymbol)
Streamer.EmitIntValue(0, 4);
Streamer.emitIntValue(0, 4);
else
Streamer.EmitSymbolValue(LPSymbol, 4);
Streamer.emitSymbolValue(LPSymbol, 4);
};
} else {
assert(!EmitColdPart &&
"cannot have exceptions in cold fragment for shared object");
Streamer.EmitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
Streamer.emitIntValue(dwarf::DW_EH_PE_omit, 1); // LPStart format
emitLandingPad = [&](const MCSymbol *LPSymbol) {
if (!LPSymbol)
Streamer.EmitIntValue(0, 4);
Streamer.emitIntValue(0, 4);
else
Streamer.emitAbsoluteSymbolDiff(LPSymbol, StartSymbol, 4);
};
}
Streamer.EmitIntValue(TTypeEncoding, 1); // TType format
Streamer.emitIntValue(TTypeEncoding, 1); // TType format
// See the comment in EHStreamer::emitExceptionTable() on the reasons to use
// uleb128 encoding (which can use a variable number of bytes to encode the same
@ -861,14 +913,14 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
// Account for any extra padding that will be added to the call site table
// length.
Streamer.EmitPaddedULEB128IntValue(TTypeBaseOffset,
TTypeBaseOffsetSize + SizeAlign);
Streamer.emitULEB128IntValue(TTypeBaseOffset,
/*PadTo=*/TTypeBaseOffsetSize + SizeAlign);
// Emit the landing pad call site table. We use signed data4 since we can emit
// a landing pad in a different part of the split function that could appear
// earlier in the address space than LPStart.
Streamer.EmitIntValue(dwarf::DW_EH_PE_sdata4, 1);
Streamer.EmitULEB128IntValue(CallSiteTableLength);
Streamer.emitIntValue(dwarf::DW_EH_PE_sdata4, 1);
Streamer.emitULEB128IntValue(CallSiteTableLength);
for (const auto &CallSite : *Sites) {
const auto *BeginLabel = CallSite.Start;
@ -882,7 +934,7 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
Streamer.emitAbsoluteSymbolDiff(BeginLabel, StartSymbol, 4);
Streamer.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
emitLandingPad(CallSite.LP);
Streamer.EmitULEB128IntValue(CallSite.Action);
Streamer.emitULEB128IntValue(CallSite.Action);
}
// Write out action, type, and type index tables at the end.
@ -894,7 +946,7 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
// For the type table, we (re-)encode it using a TTypeEncoding that matches
// the current assembler mode.
for (auto const &Byte : BF.getLSDAActionTable()) {
Streamer.EmitIntValue(Byte, 1);
Streamer.emitIntValue(Byte, 1);
}
const auto &TypeTable = (TTypeEncoding & dwarf::DW_EH_PE_indirect)
@ -909,28 +961,28 @@ void BinaryEmitter::emitLSDA(BinaryFunction &BF, bool EmitColdPart) {
default:
llvm_unreachable("unsupported TTypeEncoding");
case dwarf::DW_EH_PE_absptr:
Streamer.EmitIntValue(TypeAddress, TTypeEncodingSize);
Streamer.emitIntValue(TypeAddress, TTypeEncodingSize);
break;
case dwarf::DW_EH_PE_pcrel: {
if (TypeAddress) {
const MCSymbol *TypeSymbol =
BC.getOrCreateGlobalSymbol(TypeAddress, "TI", 0, TTypeAlignment);
MCSymbol *DotSymbol = BC.Ctx->createTempSymbol();
Streamer.EmitLabel(DotSymbol);
MCSymbol *DotSymbol = BC.Ctx->createNamedTempSymbol();
Streamer.emitLabel(DotSymbol);
const auto *SubDotExpr = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(TypeSymbol, *BC.Ctx),
MCSymbolRefExpr::create(DotSymbol, *BC.Ctx),
*BC.Ctx);
Streamer.EmitValue(SubDotExpr, TTypeEncodingSize);
Streamer.emitValue(SubDotExpr, TTypeEncodingSize);
} else {
Streamer.EmitIntValue(0, TTypeEncodingSize);
Streamer.emitIntValue(0, TTypeEncodingSize);
}
break;
}
}
}
for (auto const &Byte : BF.getLSDATypeIndexTable()) {
Streamer.EmitIntValue(Byte, 1);
Streamer.emitIntValue(Byte, 1);
}
}
@ -953,7 +1005,7 @@ void BinaryEmitter::emitDebugLineInfoForOriginalFunctions() {
BC.getCodeSection(Function.getCodeSectionName());
uint64_t Address = It.first;
if (LineTable->lookupAddressRange(Address, Function.getMaxSize(),
if (LineTable->lookupAddressRange({Address, 0}, Function.getMaxSize(),
Results)) {
auto &OutputLineTable =
BC.Ctx->getMCDwarfLineTable(Unit->getOffset()).getMCLineSections();
@ -969,7 +1021,7 @@ void BinaryEmitter::emitDebugLineInfoForOriginalFunctions() {
(DWARF2_FLAG_EPILOGUE_BEGIN * Row.EpilogueBegin),
Row.Isa,
Row.Discriminator,
Row.Address);
Row.Address.Address);
auto Loc = BC.Ctx->getCurrentDwarfLoc();
BC.Ctx->clearDwarfLocSeen();
OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
@ -984,8 +1036,8 @@ void BinaryEmitter::emitDebugLineInfoForOriginalFunctions() {
OutputLineTable.addLineEntry(MCDwarfLineEntry{nullptr, Loc},
FunctionSection);
} else {
DEBUG(dbgs() << "BOLT-DEBUG: function " << Function
<< " has no associated line number information\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: function " << Function
<< " has no associated line number information\n");
}
}
}
@ -1022,28 +1074,29 @@ void BinaryEmitter::emitFunctionBodyRaw(BinaryFunction &BF) {
auto NextStop = std::min(NextLabelOffset, NextRelocationOffset);
assert(NextStop <= BF.getSize() && "internal overflow error");
if (FunctionOffset < NextStop) {
Streamer.EmitBytes(FunctionContents.slice(FunctionOffset, NextStop));
Streamer.emitBytes(FunctionContents.slice(FunctionOffset, NextStop));
FunctionOffset = NextStop;
}
if (LI != BF.getLabels().end() && FunctionOffset == LI->first) {
Streamer.EmitLabel(LI->second);
DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << LI->second->getName()
<< " at offset 0x" << Twine::utohexstr(LI->first) << '\n');
Streamer.emitLabel(LI->second);
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted label " << LI->second->getName()
<< " at offset 0x" << Twine::utohexstr(LI->first)
<< '\n');
++LI;
}
if (RI != BF.getMoveRelocations().end() && FunctionOffset == RI->first) {
auto RelocationSize = RI->second.emit(&Streamer);
DEBUG(dbgs() << "BOLT-DEBUG: emitted relocation for symbol "
<< RI->second.Symbol->getName() << " at offset 0x"
<< Twine::utohexstr(RI->first)
<< " with size " << RelocationSize << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitted relocation for symbol "
<< RI->second.Symbol->getName() << " at offset 0x"
<< Twine::utohexstr(RI->first) << " with size "
<< RelocationSize << '\n');
FunctionOffset += RelocationSize;
++RI;
}
}
assert(FunctionOffset <= BF.getSize() && "overflow error");
if (FunctionOffset < BF.getSize()) {
Streamer.EmitBytes(FunctionContents.substr(FunctionOffset));
Streamer.emitBytes(FunctionContents.substr(FunctionOffset));
}
}

View File

@ -16,6 +16,7 @@
#include "MCPlusBuilder.h"
#include "NameResolver.h"
#include "NameShortener.h"
#include "Utils.h"
#include "llvm/ADT/edit_distance.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
@ -33,6 +34,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
@ -198,7 +201,8 @@ SMLoc findDebugLineInformationForInstructionAt(uint64_t Address,
"Cannot fit instruction debug line information into SMLoc's pointer");
SMLoc NullResult = DebugLineTableRowRef::NULL_ROW.toSMLoc();
uint32_t RowIndex = LineTable->lookupAddress(Address);
uint32_t RowIndex = LineTable->lookupAddress(
{Address, object::SectionedAddress::UndefSection});
if (RowIndex == LineTable->UnknownRowIndex)
return NullResult;
@ -695,6 +699,98 @@ void BinaryFunction::printRelocations(raw_ostream &OS,
}
}
namespace {
std::string mutateDWARFExpressionTargetReg(const MCCFIInstruction &Instr,
MCPhysReg NewReg) {
StringRef ExprBytes = Instr.getValues();
assert(ExprBytes.size() > 1 && "DWARF expression CFI is too short");
uint8_t Opcode = ExprBytes[0];
assert((Opcode == dwarf::DW_CFA_expression ||
Opcode == dwarf::DW_CFA_val_expression) &&
"invalid DWARF expression CFI");
const uint8_t *const Start =
reinterpret_cast<const uint8_t *>(ExprBytes.drop_front(1).data());
const uint8_t *const End =
reinterpret_cast<const uint8_t *>(Start + ExprBytes.size() - 1);
unsigned Size = 0;
decodeULEB128(Start, &Size, End);
assert(Size > 0 && "Invalid reg encoding for DWARF expression CFI");
SmallString<8> Tmp;
raw_svector_ostream OSE(Tmp);
encodeULEB128(NewReg, OSE);
return Twine(ExprBytes.slice(0, 1))
.concat(OSE.str())
.concat(ExprBytes.drop_front(1 + Size))
.str();
}
} // namespace
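The helper above splices a re-encoded register into the expression payload; a quick round-trip showing the ULEB128 primitives it relies on:

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>

using namespace llvm;

// Round-trip a register number through ULEB128, the variable-length
// encoding used inside DWARF expressions.
void ulebRoundTrip() {
  SmallString<8> Buf;
  raw_svector_ostream OS(Buf);
  encodeULEB128(300, OS); // 300 encodes as 0xAC 0x02
  unsigned Size = 0;
  uint64_t Value =
      decodeULEB128(reinterpret_cast<const uint8_t *>(Buf.data()), &Size);
  assert(Value == 300 && Size == 2);
  (void)Value;
}
```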
void BinaryFunction::mutateCFIRegisterFor(const MCInst &Instr,
MCPhysReg NewReg) {
const MCCFIInstruction *OldCFI = getCFIFor(Instr);
assert(OldCFI && "invalid CFI instr");
switch (OldCFI->getOperation()) {
default:
llvm_unreachable("Unexpected instruction");
case MCCFIInstruction::OpDefCfa:
setCFIFor(Instr, MCCFIInstruction::cfiDefCfa(nullptr, NewReg,
OldCFI->getOffset()));
break;
case MCCFIInstruction::OpDefCfaRegister:
setCFIFor(Instr, MCCFIInstruction::createDefCfaRegister(nullptr, NewReg));
break;
case MCCFIInstruction::OpOffset:
setCFIFor(Instr, MCCFIInstruction::createOffset(nullptr, NewReg,
OldCFI->getOffset()));
break;
case MCCFIInstruction::OpRegister:
setCFIFor(Instr, MCCFIInstruction::createRegister(nullptr, NewReg,
OldCFI->getRegister2()));
break;
case MCCFIInstruction::OpSameValue:
setCFIFor(Instr, MCCFIInstruction::createSameValue(nullptr, NewReg));
break;
case MCCFIInstruction::OpEscape:
setCFIFor(Instr,
MCCFIInstruction::createEscape(
nullptr,
StringRef(mutateDWARFExpressionTargetReg(*OldCFI, NewReg))));
break;
case MCCFIInstruction::OpRestore:
setCFIFor(Instr, MCCFIInstruction::createRestore(nullptr, NewReg));
break;
case MCCFIInstruction::OpUndefined:
setCFIFor(Instr, MCCFIInstruction::createUndefined(nullptr, NewReg));
break;
}
}
const MCCFIInstruction *BinaryFunction::mutateCFIOffsetFor(const MCInst &Instr,
int64_t NewOffset) {
const MCCFIInstruction *OldCFI = getCFIFor(Instr);
assert(OldCFI && "invalid CFI instr");
switch (OldCFI->getOperation()) {
default:
llvm_unreachable("Unexpected instruction");
case MCCFIInstruction::OpDefCfaOffset:
setCFIFor(Instr, MCCFIInstruction::cfiDefCfaOffset(nullptr, NewOffset));
break;
case MCCFIInstruction::OpAdjustCfaOffset:
setCFIFor(Instr,
MCCFIInstruction::createAdjustCfaOffset(nullptr, NewOffset));
break;
case MCCFIInstruction::OpDefCfa:
setCFIFor(Instr, MCCFIInstruction::cfiDefCfa(nullptr, OldCFI->getRegister(),
NewOffset));
break;
case MCCFIInstruction::OpOffset:
setCFIFor(Instr, MCCFIInstruction::createOffset(
nullptr, OldCFI->getRegister(), NewOffset));
break;
}
return getCFIFor(Instr);
}
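The cfiDefCfa calls here replace createDefCfa from older LLVM, which negated the offset internally; that is why the old call sites further down passed -CFAOffset. A sketch of the equivalence, assuming the current int64_t signature:

```cpp
#include "llvm/MC/MCDwarf.h"

using namespace llvm;

// cfiDefCfa takes the DWARF CFA offset as-is; the legacy createDefCfa
// helper negated it internally.
MCCFIInstruction defineCfa(unsigned Reg, int64_t CFAOffset) {
  // Legacy equivalent: MCCFIInstruction::createDefCfa(nullptr, Reg, -CFAOffset)
  return MCCFIInstruction::cfiDefCfa(nullptr, Reg, CFAOffset);
}
```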
IndirectBranchType
BinaryFunction::processIndirectBranch(MCInst &Instruction,
unsigned Size,
@ -806,8 +902,8 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction,
if (BaseRegNum == BC.MRI->getProgramCounter())
ArrayStart += getAddress() + Offset + Size;
DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x"
<< Twine::utohexstr(ArrayStart) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: addressed memory is 0x"
<< Twine::utohexstr(ArrayStart) << '\n');
auto Section = BC.getSectionForAddress(ArrayStart);
if (!Section) {
@ -918,7 +1014,7 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address,
}
}
auto *Label = BC.Ctx->createTempSymbol();
auto *Label = BC.Ctx->createNamedTempSymbol();
Labels[Offset] = Label;
return Label;
@ -984,7 +1080,7 @@ bool BinaryFunction::disassemble() {
// Insert a label at the beginning of the function. This will be our first
// basic block.
Labels[0] = Ctx->createTempSymbol("BB0", false);
Labels[0] = Ctx->createNamedTempSymbol("BB0");
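The createTempSymbol(Name, AlwaysAddSuffix) overload was split upstream; createNamedTempSymbol keeps the given name and always appends a unique numeric suffix, a small behavior change from the old AlwaysAddSuffix=false call. A sketch:

```cpp
#include "llvm/MC/MCContext.h"

// Named temporaries now get the private-label prefix plus a unique numeric
// suffix; the old two-argument createTempSymbol overload is gone.
llvm::MCSymbol *makeEntryLabel(llvm::MCContext &Ctx) {
  return Ctx.createNamedTempSymbol("BB0");
}
```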
auto handlePCRelOperand =
[&](MCInst &Instruction, uint64_t Address, uint64_t Size) {
@ -992,7 +1088,7 @@ bool BinaryFunction::disassemble() {
if (!MIB->evaluateMemOperandTarget(Instruction, TargetAddress, Address,
Size)) {
errs() << "BOLT-ERROR: PC-relative operand can't be evaluated:\n";
BC.InstPrinter->printInst(&Instruction, errs(), "", *BC.STI);
BC.InstPrinter->printInst(&Instruction, 0, "", *BC.STI, errs());
errs() << '\n';
Instruction.dump_pretty(errs(), BC.InstPrinter.get());
errs() << '\n';
@ -1052,7 +1148,6 @@ bool BinaryFunction::disassemble() {
Size,
FunctionData.slice(Offset),
AbsoluteInstrAddr,
nulls(),
nulls())) {
// Functions with "soft" boundaries, e.g. coming from assembly source,
// can have 0-byte padding at the end.
@ -1113,12 +1208,14 @@ bool BinaryFunction::disassemble() {
ItrE = Relocations.lower_bound(Offset + Size); Itr != ItrE; ++Itr) {
const auto &Relocation = Itr->second;
DEBUG(dbgs() << "BOLT-DEBUG: replacing immediate 0x"
<< Twine::utohexstr(Relocation.Value) << " with relocation"
" against " << Relocation.Symbol
<< "+" << Relocation.Addend << " in function " << *this
<< " for instruction at offset 0x"
<< Twine::utohexstr(Offset) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: replacing immediate 0x"
<< Twine::utohexstr(Relocation.Value)
<< " with relocation"
" against "
<< Relocation.Symbol << "+" << Relocation.Addend
<< " in function " << *this
<< " for instruction at offset 0x"
<< Twine::utohexstr(Offset) << '\n');
// Process reference to the primary symbol.
if (!Relocation.isPCRelative())
@ -1196,10 +1293,10 @@ bool BinaryFunction::disassemble() {
if (TargetAddress == getAddress() + getSize() &&
TargetAddress < getAddress() + getMaxSize()) {
// Result of __builtin_unreachable().
DEBUG(dbgs() << "BOLT-DEBUG: jump past end detected at 0x"
<< Twine::utohexstr(AbsoluteInstrAddr)
<< " in function " << *this
<< " : replacing with nop.\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump past end detected at 0x"
<< Twine::utohexstr(AbsoluteInstrAddr)
<< " in function " << *this
<< " : replacing with nop.\n");
BC.MIB->createNoop(Instruction);
if (IsCondBranch) {
// Register branch offset for profile validation.
@ -1260,11 +1357,12 @@ bool BinaryFunction::disassemble() {
(RelSize == 1) ? ELF::R_X86_64_PC8 : ELF::R_X86_64_PC32;
if (BC.isAArch64())
RelType = ELF::R_AARCH64_CALL26;
DEBUG(dbgs() << "BOLT-DEBUG: creating relocation for static"
<< " function call to " << TargetSymbol->getName()
<< " at offset 0x" << Twine::utohexstr(RelOffset)
<< " with size " << RelSize << " for function "
<< *this << '\n');
LLVM_DEBUG(dbgs()
<< "BOLT-DEBUG: creating relocation for static"
<< " function call to " << TargetSymbol->getName()
<< " at offset 0x" << Twine::utohexstr(RelOffset)
<< " with size " << RelSize << " for function "
<< *this << '\n');
addRelocation(getAddress() + RelOffset, TargetSymbol, RelType,
-RelSize, 0);
}
@ -1434,7 +1532,6 @@ bool BinaryFunction::scanExternalRefs() {
Size,
FunctionData.slice(Offset),
AbsoluteInstrAddr,
nulls(),
nulls())) {
if (opts::Verbosity >= 1 && !isZeroPaddingAt(Offset)) {
errs() << "BOLT-WARNING: unable to disassemble instruction at offset 0x"
@ -1703,13 +1800,11 @@ void BinaryFunction::postProcessJumpTables() {
auto EntryAddress = JT->getAddress() + EntryOffset;
auto Res = BC.removeRelocationAt(EntryAddress);
(void)Res;
DEBUG(
auto Section = BC.getSectionForAddress(EntryAddress);
auto Offset = EntryAddress - Section->getAddress();
dbgs() << "BOLT-DEBUG: removing relocation from section "
<< Section->getName() << " at offset 0x"
<< Twine::utohexstr(Offset) << " = "
<< Res << '\n');
LLVM_DEBUG(auto Section = BC.getSectionForAddress(EntryAddress);
auto Offset = EntryAddress - Section->getAddress();
dbgs() << "BOLT-DEBUG: removing relocation from section "
<< Section->getName() << " at offset 0x"
<< Twine::utohexstr(Offset) << " = " << Res << '\n');
}
EntryOffset += JT->EntrySize;
@ -1822,8 +1917,8 @@ bool BinaryFunction::postProcessIndirectBranches(
// Invalidating the jump table may also invalidate other jump table
// boundaries. Until we have/need support for this, mark the
// function as non-simple.
DEBUG(dbgs() << "BOLT-DEBUG: rejected jump table reference"
<< JT->getName() << " in " << *this << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: rejected jump table reference"
<< JT->getName() << " in " << *this << '\n');
return false;
}
}
@ -1852,8 +1947,8 @@ bool BinaryFunction::postProcessIndirectBranches(
outs() << "BOLT-INFO: rejected potential indirect tail call in "
<< "function " << *this << " in basic block "
<< BB->getName() << ".\n";
DEBUG(BC.printInstructions(dbgs(), BB->begin(), BB->end(),
BB->getOffset(), this, true));
LLVM_DEBUG(BC.printInstructions(dbgs(), BB->begin(), BB->end(),
BB->getOffset(), this, true));
}
if (!opts::StrictMode)
@ -2004,8 +2099,8 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
bool IsLKMarker = BC.LKMarkers.count(InstrInputAddr);
if (IsSDTMarker || IsLKMarker) {
HasSDTMarker = true;
DEBUG(dbgs() << "SDTMarker or LKMarker detected in the input at : "
<< utohexstr(InstrInputAddr) << "\n");
LLVM_DEBUG(dbgs() << "SDTMarker or LKMarker detected in the input at : "
<< utohexstr(InstrInputAddr) << "\n");
if (!MIB->hasAnnotation(Instr, "Offset")) {
MIB->addAnnotation(Instr, "Offset", static_cast<uint32_t>(Offset),
AllocatorId);
@ -2035,7 +2130,7 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
MCSymbol *Label;
{
auto L = BC.scopeLock();
Label = BC.Ctx->createTempSymbol("FT", true);
Label = BC.Ctx->createNamedTempSymbol("FT");
}
InsertBB = addBasicBlock(
Offset, Label, opts::PreserveBlocksAlignment && IsLastInstrNop);
@ -2077,15 +2172,16 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
}
// Intermediate dump.
DEBUG(print(dbgs(), "after creating basic blocks"));
LLVM_DEBUG(print(dbgs(), "after creating basic blocks"));
// TODO: properly handle calls to no-return functions, e.g. exit(3), etc.
// Otherwise we'll see false fall-through blocks.
for (auto &Branch : TakenBranches) {
DEBUG(dbgs() << "registering branch [0x" << Twine::utohexstr(Branch.first)
<< "] -> [0x" << Twine::utohexstr(Branch.second) << "]\n");
LLVM_DEBUG(dbgs() << "registering branch [0x"
<< Twine::utohexstr(Branch.first) << "] -> [0x"
<< Twine::utohexstr(Branch.second) << "]\n");
auto *FromBB = getBasicBlockContainingOffset(Branch.first);
auto *ToBB = getBasicBlockAtOffset(Branch.second);
if (!FromBB || !ToBB) {
@ -2137,8 +2233,8 @@ bool BinaryFunction::buildCFG(MCPlusBuilder::AllocatorIdTy AllocatorId) {
if (!IsPrevFT) {
// Possibly a call that does not return.
DEBUG(dbgs() << "last block was marked as a fall-through in " << *this
<< '\n');
LLVM_DEBUG(dbgs() << "last block was marked as a fall-through in " << *this
<< '\n');
}
// Assign landing pads and throwers info.
@ -2236,10 +2332,10 @@ void BinaryFunction::calculateMacroOpFusionStats() {
if (!Offset || (getAddress() + Offset) % 64)
continue;
DEBUG(dbgs() << "\nmissed macro-op fusion at address 0x"
<< Twine::utohexstr(getAddress() + Offset) << " in function "
<< *this << "; executed " << BB->getKnownExecutionCount()
<< " times.\n");
LLVM_DEBUG(dbgs() << "\nmissed macro-op fusion at address 0x"
<< Twine::utohexstr(getAddress() + Offset)
<< " in function " << *this << "; executed "
<< BB->getKnownExecutionCount() << " times.\n");
++BC.MissedMacroFusionPairs;
BC.MissedMacroFusionExecCount += BB->getKnownExecutionCount();
}
@ -2300,7 +2396,7 @@ void BinaryFunction::removeConditionalTailCalls() {
// is, so we can map samples recorded in new BBs back to the original BB
// seen in the input binary (if using BAT)
auto TailCallBB = createBasicBlock(BB.getInputOffset(),
BC.Ctx->createTempSymbol("TC", true));
BC.Ctx->createNamedTempSymbol("TC"));
TailCallBB->addInstruction(TailCallInstr);
TailCallBB->setCFIState(CFIStateBeforeCTC);
@ -2444,8 +2540,6 @@ private:
case MCCFIInstruction::OpRestore:
case MCCFIInstruction::OpUndefined:
case MCCFIInstruction::OpRegister:
case MCCFIInstruction::OpExpression:
case MCCFIInstruction::OpValExpression:
RegRule[Instr.getRegister()] = RuleNumber;
break;
case MCCFIInstruction::OpDefCfaRegister:
@ -2461,12 +2555,19 @@ private:
CFAOffset = Instr.getOffset();
CFARule = UNKNOWN;
break;
case MCCFIInstruction::OpDefCfaExpression:
CFARule = RuleNumber;
case MCCFIInstruction::OpEscape: {
Optional<uint8_t> Reg = readDWARFExpressionTargetReg(Instr.getValues());
// Handle DW_CFA_def_cfa_expression
if (!Reg) {
CFARule = RuleNumber;
break;
}
RegRule[*Reg] = RuleNumber;
break;
}
case MCCFIInstruction::OpAdjustCfaOffset:
case MCCFIInstruction::OpWindowSave:
case MCCFIInstruction::OpEscape:
case MCCFIInstruction::OpNegateRAState:
llvm_unreachable("unsupported CFI opcode");
break;
case MCCFIInstruction::OpRememberState:
@ -2555,15 +2656,27 @@ struct CFISnapshotDiff : public CFISnapshot {
case MCCFIInstruction::OpRestore:
case MCCFIInstruction::OpUndefined:
case MCCFIInstruction::OpRegister:
case MCCFIInstruction::OpExpression:
case MCCFIInstruction::OpValExpression: {
if (RestoredRegs[Instr.getRegister()])
case MCCFIInstruction::OpEscape: {
uint32_t Reg;
if (Instr.getOperation() != MCCFIInstruction::OpEscape) {
Reg = Instr.getRegister();
} else {
Optional<uint8_t> R = readDWARFExpressionTargetReg(Instr.getValues());
// Handle DW_CFA_def_cfa_expression
if (!R) {
if (RestoredCFAReg && RestoredCFAOffset)
return true;
RestoredCFAReg = true;
RestoredCFAOffset = true;
return false;
}
Reg = *R;
}
if (RestoredRegs[Reg])
return true;
RestoredRegs[Instr.getRegister()] = true;
RestoredRegs[Reg] = true;
const int32_t CurRegRule =
RegRule.find(Instr.getRegister()) != RegRule.end()
? RegRule[Instr.getRegister()]
: UNKNOWN;
RegRule.find(Reg) != RegRule.end() ? RegRule[Reg] : UNKNOWN;
if (CurRegRule == UNKNOWN) {
if (Instr.getOperation() == MCCFIInstruction::OpRestore ||
Instr.getOperation() == MCCFIInstruction::OpSameValue)
@ -2590,15 +2703,9 @@ struct CFISnapshotDiff : public CFISnapshot {
RestoredCFAReg = true;
RestoredCFAOffset = true;
return CFAReg == Instr.getRegister() && CFAOffset == Instr.getOffset();
case MCCFIInstruction::OpDefCfaExpression:
if (RestoredCFAReg && RestoredCFAOffset)
return true;
RestoredCFAReg = true;
RestoredCFAOffset = true;
return false;
case MCCFIInstruction::OpAdjustCfaOffset:
case MCCFIInstruction::OpWindowSave:
case MCCFIInstruction::OpEscape:
case MCCFIInstruction::OpNegateRAState:
llvm_unreachable("unsupported CFI opcode");
return false;
case MCCFIInstruction::OpRememberState:
@ -2659,6 +2766,32 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
CFISnapshotDiff FromCFITable(ToCFITable);
FromCFITable.advanceTo(FromState);
auto undoStateDefCfa = [&]() {
if (ToCFITable.CFARule == CFISnapshot::UNKNOWN) {
FrameInstructions.emplace_back(MCCFIInstruction::cfiDefCfa(
nullptr, ToCFITable.CFAReg, ToCFITable.CFAOffset));
if (FromCFITable.isRedundant(FrameInstructions.back())) {
FrameInstructions.pop_back();
return;
}
NewStates.push_back(FrameInstructions.size() - 1);
InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1);
++InsertIt;
} else if (ToCFITable.CFARule < 0) {
if (FromCFITable.isRedundant(CIEFrameInstructions[-ToCFITable.CFARule]))
return;
NewStates.push_back(FrameInstructions.size());
InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size());
++InsertIt;
FrameInstructions.emplace_back(CIEFrameInstructions[-ToCFITable.CFARule]);
} else if (!FromCFITable.isRedundant(
FrameInstructions[ToCFITable.CFARule])) {
NewStates.push_back(ToCFITable.CFARule);
InsertIt = addCFIPseudo(InBB, InsertIt, ToCFITable.CFARule);
++InsertIt;
}
};
auto undoState = [&](const MCCFIInstruction &Instr) {
switch (Instr.getOperation()) {
case MCCFIInstruction::OpRememberState:
@ -2669,13 +2802,24 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
case MCCFIInstruction::OpOffset:
case MCCFIInstruction::OpRestore:
case MCCFIInstruction::OpUndefined:
case MCCFIInstruction::OpRegister:
case MCCFIInstruction::OpExpression:
case MCCFIInstruction::OpValExpression: {
if (ToCFITable.RegRule.find(Instr.getRegister()) ==
ToCFITable.RegRule.end()) {
case MCCFIInstruction::OpEscape:
case MCCFIInstruction::OpRegister: {
uint32_t Reg;
if (Instr.getOperation() != MCCFIInstruction::OpEscape) {
Reg = Instr.getRegister();
} else {
Optional<uint8_t> R = readDWARFExpressionTargetReg(Instr.getValues());
// Handle DW_CFA_def_cfa_expression
if (!R) {
undoStateDefCfa();
return;
}
Reg = *R;
}
if (ToCFITable.RegRule.find(Reg) == ToCFITable.RegRule.end()) {
FrameInstructions.emplace_back(
MCCFIInstruction::createRestore(nullptr, Instr.getRegister()));
MCCFIInstruction::createRestore(nullptr, Reg));
if (FromCFITable.isRedundant(FrameInstructions.back())) {
FrameInstructions.pop_back();
break;
@ -2685,7 +2829,7 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
++InsertIt;
break;
}
const int32_t Rule = ToCFITable.RegRule[Instr.getRegister()];
const int32_t Rule = ToCFITable.RegRule[Reg];
if (Rule < 0) {
if (FromCFITable.isRedundant(CIEFrameInstructions[-Rule]))
break;
@ -2705,35 +2849,11 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
case MCCFIInstruction::OpDefCfaRegister:
case MCCFIInstruction::OpDefCfaOffset:
case MCCFIInstruction::OpDefCfa:
case MCCFIInstruction::OpDefCfaExpression:
if (ToCFITable.CFARule == CFISnapshot::UNKNOWN) {
FrameInstructions.emplace_back(MCCFIInstruction::createDefCfa(
nullptr, ToCFITable.CFAReg, -ToCFITable.CFAOffset));
if (FromCFITable.isRedundant(FrameInstructions.back())) {
FrameInstructions.pop_back();
break;
}
NewStates.push_back(FrameInstructions.size() - 1);
InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size() - 1);
++InsertIt;
} else if (ToCFITable.CFARule < 0) {
if (FromCFITable.isRedundant(CIEFrameInstructions[-ToCFITable.CFARule]))
break;
NewStates.push_back(FrameInstructions.size());
InsertIt = addCFIPseudo(InBB, InsertIt, FrameInstructions.size());
++InsertIt;
FrameInstructions.emplace_back(
CIEFrameInstructions[-ToCFITable.CFARule]);
} else if (!FromCFITable.isRedundant(
FrameInstructions[ToCFITable.CFARule])) {
NewStates.push_back(ToCFITable.CFARule);
InsertIt = addCFIPseudo(InBB, InsertIt, ToCFITable.CFARule);
++InsertIt;
}
undoStateDefCfa();
break;
case MCCFIInstruction::OpAdjustCfaOffset:
case MCCFIInstruction::OpWindowSave:
case MCCFIInstruction::OpEscape:
case MCCFIInstruction::OpNegateRAState:
llvm_unreachable("unsupported CFI opcode");
break;
case MCCFIInstruction::OpGnuArgsSize:
@ -2742,7 +2862,6 @@ BinaryFunction::unwindCFIState(int32_t FromState, int32_t ToState,
}
};
// Undo all modifications from ToState to FromState
for (int32_t I = ToState, E = FromState; I != E; ++I) {
const auto &Instr = FrameInstructions[I];
@ -2769,7 +2888,7 @@ void BinaryFunction::normalizeCFIState() {
std::stack<int32_t> Stack;
for (BinaryBasicBlock *CurBB : BasicBlocksLayout) {
for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
if (auto *CFI = getCFIFor(*II)) {
if (const MCCFIInstruction *CFI = getCFIFor(*II)) {
if (CFI->getOperation() == MCCFIInstruction::OpRememberState) {
Stack.push(II->getOperand(0).getImm());
continue;
@ -2787,9 +2906,10 @@ void BinaryFunction::normalizeCFIState() {
}
bool BinaryFunction::finalizeCFIState() {
DEBUG(dbgs() << "Trying to fix CFI states for each BB after reordering.\n");
DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this
<< ": ");
LLVM_DEBUG(
dbgs() << "Trying to fix CFI states for each BB after reordering.\n");
LLVM_DEBUG(dbgs() << "This is the list of CFI states for each BB of " << *this
<< ": ");
int32_t State = 0;
bool SeenCold = false;
@ -2823,13 +2943,13 @@ bool BinaryFunction::finalizeCFIState() {
}
State = CFIStateAtExit;
DEBUG(dbgs() << Sep << State; Sep = ", ");
LLVM_DEBUG(dbgs() << Sep << State; Sep = ", ");
}
DEBUG(dbgs() << "\n");
LLVM_DEBUG(dbgs() << "\n");
for (auto BB : BasicBlocksLayout) {
for (auto II = BB->begin(); II != BB->end(); ) {
auto CFI = getCFIFor(*II);
const MCCFIInstruction *CFI = getCFIFor(*II);
if (CFI &&
(CFI->getOperation() == MCCFIInstruction::OpRememberState ||
CFI->getOperation() == MCCFIInstruction::OpRestoreState)) {
@ -2934,7 +3054,7 @@ void BinaryFunction::setIgnored() {
IsIgnored = true;
IsSimple = false;
DEBUG(dbgs() << "Ignoring " << getPrintName() << '\n');
LLVM_DEBUG(dbgs() << "Ignoring " << getPrintName() << '\n');
}
void BinaryFunction::duplicateConstantIslands() {
@ -3080,13 +3200,13 @@ void BinaryFunction::dumpGraph(raw_ostream& OS) const {
std::string Branch;
if (Success) {
if (Succ == BB->getConditionalSuccessor(true)) {
Branch = CondBranch
? BC.InstPrinter->getOpcodeName(CondBranch->getOpcode())
: "TB";
Branch = CondBranch ? std::string(BC.InstPrinter->getOpcodeName(
CondBranch->getOpcode()))
: "TB";
} else if (Succ == BB->getConditionalSuccessor(false)) {
Branch = UncondBranch
? BC.InstPrinter->getOpcodeName(UncondBranch->getOpcode())
: "FB";
Branch = UncondBranch ? std::string(BC.InstPrinter->getOpcodeName(
UncondBranch->getOpcode()))
: "FB";
} else {
Branch = "FT";
}
@ -3126,7 +3246,7 @@ void BinaryFunction::viewGraph() const {
<< " bolt-cfg-XXXXX.dot temporary file.\n";
return;
}
dumpGraphToFile(Filename.str());
dumpGraphToFile(std::string(Filename));
if (DisplayGraph(Filename)) {
errs() << "BOLT-ERROR: Can't display " << Filename << " with graphviz.\n";
}
@ -3144,7 +3264,7 @@ void BinaryFunction::dumpGraphForPass(std::string Annotation) const {
void BinaryFunction::dumpGraphToFile(std::string Filename) const {
std::error_code EC;
raw_fd_ostream of(Filename, EC, sys::fs::F_None);
raw_fd_ostream of(Filename, EC, sys::fs::OF_None);
if (EC) {
if (opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: " << EC.message() << ", unable to open "
@ -3333,7 +3453,7 @@ void BinaryFunction::propagateGnuArgsSizeInfo(
for (auto II = BB->begin(); II != BB->end(); ) {
auto &Instr = *II;
if (BC.MIB->isCFI(Instr)) {
auto CFI = getCFIFor(Instr);
const MCCFIInstruction *CFI = getCFIFor(Instr);
if (CFI->getOperation() == MCCFIInstruction::OpGnuArgsSize) {
CurrentGnuArgsSize = CFI->getOffset();
// Delete DW_CFA_GNU_args_size instructions and only regenerate
@ -3364,8 +3484,8 @@ void BinaryFunction::postProcessBranches() {
// one valid successor. Such behaviour is undefined and thus we remove
// the conditional branch while leaving a valid successor.
BB->eraseInstruction(std::prev(LastInstrRI.base()));
DEBUG(dbgs() << "BOLT-DEBUG: erasing conditional branch in "
<< BB->getName() << " in function " << *this << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: erasing conditional branch in "
<< BB->getName() << " in function " << *this << '\n');
}
} else if (BB->succ_size() == 0) {
// Ignore unreachable basic blocks.
@ -3375,14 +3495,15 @@ void BinaryFunction::postProcessBranches() {
// If the basic block does not end with a terminator, we insert a return
// instruction unless the last instruction is a call.
if (LastInstrRI == BB->rend()) {
DEBUG(dbgs() << "BOLT-DEBUG: at least one instruction expected in BB "
<< BB->getName() << " in function " << *this << '\n');
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: at least one instruction expected in BB "
<< BB->getName() << " in function " << *this << '\n');
continue;
}
if (!BC.MIB->isTerminator(*LastInstrRI) &&
!BC.MIB->isCall(*LastInstrRI)) {
DEBUG(dbgs() << "BOLT-DEBUG: adding return to basic block "
<< BB->getName() << " in function " << *this << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding return to basic block "
<< BB->getName() << " in function " << *this << '\n');
MCInst ReturnInstr;
BC.MIB->createReturn(ReturnInstr);
BB->addInstruction(ReturnInstr);
@ -3554,8 +3675,8 @@ BinaryFunction::BasicBlockOrderType BinaryFunction::dfs() const {
MCInst *UncondBranch = nullptr;
if (BB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch) &&
CondBranch && BB->succ_size() == 2) {
if (BC.MIB->getCanonicalBranchOpcode(CondBranch->getOpcode()) ==
CondBranch->getOpcode()) {
if (BC.MIB->getCanonicalBranchCondCode(BC.MIB->getCondCode(
*CondBranch)) == BC.MIB->getCondCode(*CondBranch)) {
Stack.push(BB->getConditionalSuccessor(true));
Stack.push(BB->getConditionalSuccessor(false));
} else {
@ -3845,7 +3966,7 @@ BinaryBasicBlock *BinaryFunction::splitEdge(BinaryBasicBlock *From,
MCSymbol *Tmp;
{
auto L = BC.scopeLock();
Tmp = BC.Ctx->createTempSymbol("SplitEdge", true);
Tmp = BC.Ctx->createNamedTempSymbol("SplitEdge");
}
// Link new BBs to the original input offset of the From BB, so we can map
// samples recorded in new BBs back to the original BB seen in the input
@ -3963,7 +4084,7 @@ bool BinaryFunction::isSymbolValidInScope(const SymbolRef &Symbol,
if (cantFail(Symbol.getType()) != SymbolRef::ST_Unknown)
return false;
if (Symbol.getFlags() & SymbolRef::SF_Global)
if (cantFail(Symbol.getFlags()) & SymbolRef::SF_Global)
return false;
return true;
@ -4245,9 +4366,10 @@ DebugAddressRangesVector BinaryFunction::translateInputToOutputRanges(
uint64_t PrevEndAddress = 0;
for (const auto &Range : InputRanges) {
if (!containsAddress(Range.LowPC)) {
DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
<< *this << " : [0x" << Twine::utohexstr(Range.LowPC)
<< ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
<< *this << " : [0x" << Twine::utohexstr(Range.LowPC) << ", 0x"
<< Twine::utohexstr(Range.HighPC) << "]\n");
PrevEndAddress = 0;
continue;
}
@ -4263,9 +4385,10 @@ DebugAddressRangesVector BinaryFunction::translateInputToOutputRanges(
do {
const auto *BB = BBI->second;
if (InputOffset < BB->getOffset() || InputOffset >= BB->getEndOffset()) {
DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
<< *this << " : [0x" << Twine::utohexstr(Range.LowPC)
<< ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
<< *this << " : [0x" << Twine::utohexstr(Range.LowPC)
<< ", 0x" << Twine::utohexstr(Range.HighPC) << "]\n");
PrevEndAddress = 0;
break;
}
@ -4331,9 +4454,9 @@ MCInst *BinaryFunction::getInstructionAtOffset(uint64_t Offset) {
}
}
DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList(
DWARFDebugLoc::LocationList InputLL) const {
DWARFDebugLoc::LocationList OutputLL;
DebugLocationsVector BinaryFunction::translateInputToOutputLocationList(
const DebugLocationsVector &InputLL) const {
DebugLocationsVector OutputLL;
if (isFolded()) {
return OutputLL;
@ -4345,14 +4468,15 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList(
}
uint64_t PrevEndAddress = 0;
SmallVectorImpl<char> *PrevLoc = nullptr;
for (const auto &Entry : InputLL.Entries) {
const auto Start = Entry.Begin;
const auto End = Entry.End;
SmallVectorImpl<uint8_t> *PrevExpr = nullptr;
for (const DebugLocationEntry &Entry : InputLL) {
const uint64_t Start = Entry.LowPC;
const uint64_t End = Entry.HighPC;
if (!containsAddress(Start)) {
DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
<< *this << " : [0x" << Twine::utohexstr(Start)
<< ", 0x" << Twine::utohexstr(End) << "]\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected "
"for "
<< *this << " : [0x" << Twine::utohexstr(Start)
<< ", 0x" << Twine::utohexstr(End) << "]\n");
continue;
}
auto InputOffset = Start - getAddress();
@ -4365,9 +4489,10 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList(
do {
const auto *BB = BBI->second;
if (InputOffset < BB->getOffset() || InputOffset >= BB->getEndOffset()) {
DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected for "
<< *this << " : [0x" << Twine::utohexstr(Start)
<< ", 0x" << Twine::utohexstr(End) << "]\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: invalid debug address range detected "
"for "
<< *this << " : [0x" << Twine::utohexstr(Start)
<< ", 0x" << Twine::utohexstr(End) << "]\n");
PrevEndAddress = 0;
break;
}
@ -4379,17 +4504,16 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList(
if (InputEndOffset < BB->getEndOffset())
EndAddress = StartAddress + InputEndOffset - InputOffset;
if (StartAddress == PrevEndAddress && Entry.Loc == *PrevLoc) {
OutputLL.Entries.back().End = std::max(OutputLL.Entries.back().End,
EndAddress);
if (StartAddress == PrevEndAddress && Entry.Expr == *PrevExpr) {
OutputLL.back().HighPC = std::max(OutputLL.back().HighPC, EndAddress);
} else {
OutputLL.Entries.emplace_back(
DWARFDebugLoc::Entry{StartAddress,
std::max(StartAddress, EndAddress),
Entry.Loc});
OutputLL.emplace_back(
DebugLocationEntry{StartAddress,
std::max(StartAddress, EndAddress),
Entry.Expr});
}
PrevEndAddress = OutputLL.Entries.back().End;
PrevLoc = &OutputLL.Entries.back().Loc;
PrevEndAddress = OutputLL.back().HighPC;
PrevExpr = &OutputLL.back().Expr;
}
++BBI;
@ -4398,26 +4522,23 @@ DWARFDebugLoc::LocationList BinaryFunction::translateInputToOutputLocationList(
}
// Sort and merge adjacent entries with identical location.
std::stable_sort(OutputLL.Entries.begin(), OutputLL.Entries.end(),
[] (const DWARFDebugLoc::Entry &A, const DWARFDebugLoc::Entry &B) {
return A.Begin < B.Begin;
std::stable_sort(OutputLL.begin(), OutputLL.end(),
[] (const DebugLocationEntry &A, const DebugLocationEntry &B) {
return A.LowPC < B.LowPC;
});
DWARFDebugLoc::LocationList MergedLL;
DebugLocationsVector MergedLL;
PrevEndAddress = 0;
PrevLoc = nullptr;
for (const auto &Entry : OutputLL.Entries) {
if (Entry.Begin <= PrevEndAddress && *PrevLoc == Entry.Loc) {
MergedLL.Entries.back().End = std::max(Entry.End,
MergedLL.Entries.back().End);
PrevExpr = nullptr;
for (const auto &Entry : OutputLL) {
if (Entry.LowPC <= PrevEndAddress && *PrevExpr == Entry.Expr) {
MergedLL.back().HighPC = std::max(Entry.HighPC, MergedLL.back().HighPC);
} else {
const auto Begin = std::max(Entry.Begin, PrevEndAddress);
const auto End = std::max(Begin, Entry.End);
MergedLL.Entries.emplace_back(DWARFDebugLoc::Entry{Begin,
End,
Entry.Loc});
const uint64_t Begin = std::max(Entry.LowPC, PrevEndAddress);
const uint64_t End = std::max(Begin, Entry.HighPC);
MergedLL.emplace_back(DebugLocationEntry{Begin, End, Entry.Expr});
}
PrevEndAddress = MergedLL.Entries.back().End;
PrevLoc = &MergedLL.Entries.back().Loc;
PrevEndAddress = MergedLL.back().HighPC;
PrevExpr = &MergedLL.back().Expr;
}
return MergedLL;
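
The merging logic above boils down to interval bookkeeping: sort entries by LowPC, then coalesce neighbours that touch and carry the same DWARF expression. A self-contained sketch of that merge step; the struct below is a simplified stand-in for BOLT's DebugLocationEntry, with only the field names taken from the diff:

#include <algorithm>
#include <cstdint>
#include <vector>

struct DebugLocationEntry {
  uint64_t LowPC;
  uint64_t HighPC;
  std::vector<uint8_t> Expr; // DWARF expression bytes
};

using DebugLocationsVector = std::vector<DebugLocationEntry>;

// Sort by LowPC, then merge adjacent or overlapping entries whose
// expressions are identical, mirroring the post-processing above.
static DebugLocationsVector mergeLocationList(DebugLocationsVector LL) {
  std::stable_sort(LL.begin(), LL.end(),
                   [](const DebugLocationEntry &A, const DebugLocationEntry &B) {
                     return A.LowPC < B.LowPC;
                   });
  DebugLocationsVector Merged;
  for (const DebugLocationEntry &Entry : LL) {
    if (!Merged.empty() && Entry.LowPC <= Merged.back().HighPC &&
        Entry.Expr == Merged.back().Expr) {
      Merged.back().HighPC = std::max(Merged.back().HighPC, Entry.HighPC);
      continue;
    }
    // Clamp so the output ranges stay non-overlapping and well-formed.
    const uint64_t Begin =
        Merged.empty() ? Entry.LowPC : std::max(Entry.LowPC, Merged.back().HighPC);
    Merged.push_back({Begin, std::max(Begin, Entry.HighPC), Entry.Expr});
  }
  return Merged;
}
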


@ -1234,7 +1234,7 @@ public:
assert(BC.Ctx && "cannot be called with empty context");
if (!FunctionEndLabel) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
FunctionEndLabel = BC.Ctx->createTempSymbol("func_end", true);
FunctionEndLabel = BC.Ctx->createNamedTempSymbol("func_end");
}
return FunctionEndLabel;
}
@ -1243,7 +1243,7 @@ public:
MCSymbol *getFunctionColdEndLabel() const {
if (!FunctionColdEndLabel) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
FunctionColdEndLabel = BC.Ctx->createTempSymbol("func_cold_end", true);
FunctionColdEndLabel = BC.Ctx->createNamedTempSymbol("func_cold_end");
}
return FunctionColdEndLabel;
}
@ -1254,7 +1254,7 @@ public:
MCSymbol *getFunctionConstantIslandLabel() const {
if (!FunctionConstantIslandLabel) {
FunctionConstantIslandLabel =
BC.Ctx->createTempSymbol("func_const_island", true);
BC.Ctx->createNamedTempSymbol("func_const_island");
}
return FunctionConstantIslandLabel;
}
@ -1262,7 +1262,7 @@ public:
MCSymbol *getFunctionColdConstantIslandLabel() const {
if (!FunctionColdConstantIslandLabel) {
FunctionColdConstantIslandLabel =
BC.Ctx->createTempSymbol("func_cold_const_island", true);
BC.Ctx->createNamedTempSymbol("func_cold_const_island");
}
return FunctionColdConstantIslandLabel;
}
@ -1367,7 +1367,7 @@ public:
/// Assign a code section name to the function.
void setCodeSectionName(StringRef Name) {
CodeSectionName = Name;
CodeSectionName = std::string(Name);
}
/// Get output code section.
@ -1382,7 +1382,7 @@ public:
/// Assign a section name for the cold part of the function.
void setColdCodeSectionName(StringRef Name) {
ColdCodeSectionName = Name;
ColdCodeSectionName = std::string(Name);
}
/// Get output code section for cold code of this function.
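
The std::string(Name) conversions in these setters follow from StringRef's operator std::string becoming explicit in newer LLVM, so plain assignment from a StringRef to a std::string member no longer compiles. A minimal sketch (SectionNames is an illustrative type, not BOLT's):

#include "llvm/ADT/StringRef.h"
#include <string>

struct SectionNames {
  std::string CodeSectionName;

  void setCodeSectionName(llvm::StringRef Name) {
    // `CodeSectionName = Name;` no longer compiles; convert explicitly.
    CodeSectionName = std::string(Name);
    // CodeSectionName = Name.str(); // equivalent spelling
  }
};
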
@ -1580,7 +1580,7 @@ public:
assert(BC.Ctx && "cannot be called with empty context");
if (!Label) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Label = BC.Ctx->createTempSymbol("BB", true);
Label = BC.Ctx->createNamedTempSymbol("BB");
}
auto BB = std::unique_ptr<BinaryBasicBlock>(
new BinaryBasicBlock(this, Label, Offset));
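
createTempSymbol(Name, /*AlwaysAddSuffix=*/true) was split into dedicated entry points upstream; createNamedTempSymbol(Name) keeps the old behavior of a named temporary with a unique numeric suffix. A sketch assuming a live MCContext:

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"

// Produce uniquified temporary labels based on "BB", as the old
// two-argument createTempSymbol("BB", true) did.
static llvm::MCSymbol *makeBBLabel(llvm::MCContext &Ctx) {
  return Ctx.createNamedTempSymbol("BB");
}
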
@ -1608,7 +1608,7 @@ public:
if (!Label) {
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
Label = BC.Ctx->createTempSymbol("BB", true);
Label = BC.Ctx->createNamedTempSymbol("BB");
}
auto BBPtr = createBasicBlock(Offset, Label, DeriveAlignment);
BasicBlocks.emplace_back(BBPtr.release());
@ -1835,7 +1835,7 @@ public:
}
/// Retrieve the MCCFIInstruction object associated with a CFI pseudo.
MCCFIInstruction* getCFIFor(const MCInst &Instr) {
const MCCFIInstruction *getCFIFor(const MCInst &Instr) const {
if (!BC.MIB->isCFI(Instr))
return nullptr;
uint32_t Offset = Instr.getOperand(0).getImm();
@ -1843,14 +1843,19 @@ public:
return &FrameInstructions[Offset];
}
const MCCFIInstruction* getCFIFor(const MCInst &Instr) const {
if (!BC.MIB->isCFI(Instr))
return nullptr;
void setCFIFor(const MCInst &Instr, MCCFIInstruction &&CFIInst) {
assert(BC.MIB->isCFI(Instr) &&
"attempting to change CFI in a non-CFI inst");
uint32_t Offset = Instr.getOperand(0).getImm();
assert(Offset < FrameInstructions.size() && "Invalid CFI offset");
return &FrameInstructions[Offset];
FrameInstructions[Offset] = std::move(CFIInst);
}
void mutateCFIRegisterFor(const MCInst &Instr, MCPhysReg NewReg);
const MCCFIInstruction *mutateCFIOffsetFor(const MCInst &Instr,
int64_t NewOffset);
BinaryFunction &setFileOffset(uint64_t Offset) {
FileOffset = Offset;
return *this;
@ -2479,10 +2484,8 @@ public:
/// Similar to translateInputToOutputRanges() but operates on location lists
/// and moves associated data to output location lists.
///
/// \p BaseAddress is applied to all addresses in \pInputLL.
DWARFDebugLoc::LocationList translateInputToOutputLocationList(
DWARFDebugLoc::LocationList InputLL) const;
DebugLocationsVector translateInputToOutputLocationList(
const DebugLocationsVector &InputLL) const;
/// Return true if the function is an AArch64 linker inserted veneer
bool isAArch64Veneer() const;


@ -136,7 +136,7 @@ void BinaryFunction::postProcessProfile() {
// function calls from the block.
for (auto &Inst : *BB) {
// Ignore non-call instruction
if (!BC.MIA->isCall(Inst))
if (!BC.MIB->isCall(Inst))
continue;
auto CountAnnt = BC.MIB->tryGetAnnotationAs<uint64_t>(Inst, "Count");
@ -325,7 +325,7 @@ void BinaryFunction::inferFallThroughCounts() {
if (BBExecCount > TotalReportedJumps)
Inferred = BBExecCount - TotalReportedJumps;
DEBUG(
LLVM_DEBUG(
if (BBExecCount < TotalReportedJumps)
dbgs()
<< "Fall-through inference is slightly inconsistent. "


@ -386,85 +386,85 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
const auto InitialDynoStats = getDynoStats(BC.getBinaryFunctions());
if (opts::Instrument) {
Manager.registerPass(llvm::make_unique<Instrumentation>(NeverPrint));
Manager.registerPass(std::make_unique<Instrumentation>(NeverPrint));
}
// Here we manage dependencies/order manually, since passes are run in the
// order they're registered.
// Run this pass first to use stats for the original functions.
Manager.registerPass(llvm::make_unique<PrintProgramStats>(NeverPrint));
Manager.registerPass(std::make_unique<PrintProgramStats>(NeverPrint));
if (opts::PrintProfileStats)
Manager.registerPass(llvm::make_unique<PrintProfileStats>(NeverPrint));
Manager.registerPass(std::make_unique<PrintProfileStats>(NeverPrint));
Manager.registerPass(llvm::make_unique<ValidateInternalCalls>(NeverPrint));
Manager.registerPass(std::make_unique<ValidateInternalCalls>(NeverPrint));
Manager.registerPass(llvm::make_unique<StripRepRet>(NeverPrint),
Manager.registerPass(std::make_unique<StripRepRet>(NeverPrint),
opts::StripRepRet);
Manager.registerPass(llvm::make_unique<IdenticalCodeFolding>(PrintICF),
Manager.registerPass(std::make_unique<IdenticalCodeFolding>(PrintICF),
opts::ICF);
if (BC.isAArch64())
Manager.registerPass(
llvm::make_unique<VeneerElimination>(PrintVeneerElimination));
std::make_unique<VeneerElimination>(PrintVeneerElimination));
Manager.registerPass(
llvm::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
std::make_unique<SpecializeMemcpy1>(NeverPrint, opts::SpecializeMemcpy1),
!opts::SpecializeMemcpy1.empty());
Manager.registerPass(llvm::make_unique<InlineMemcpy>(NeverPrint),
Manager.registerPass(std::make_unique<InlineMemcpy>(NeverPrint),
opts::StringOps);
Manager.registerPass(llvm::make_unique<IndirectCallPromotion>(PrintICP));
Manager.registerPass(std::make_unique<IndirectCallPromotion>(PrintICP));
Manager.registerPass(
llvm::make_unique<JTFootprintReduction>(PrintJTFootprintReduction),
std::make_unique<JTFootprintReduction>(PrintJTFootprintReduction),
opts::JTFootprintReductionFlag);
Manager.registerPass(
llvm::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
std::make_unique<SimplifyRODataLoads>(PrintSimplifyROLoads),
opts::SimplifyRODataLoads);
Manager.registerPass(llvm::make_unique<RegReAssign>(PrintRegReAssign),
Manager.registerPass(std::make_unique<RegReAssign>(PrintRegReAssign),
opts::RegReAssign);
Manager.registerPass(llvm::make_unique<Inliner>(PrintInline));
Manager.registerPass(std::make_unique<Inliner>(PrintInline));
Manager.registerPass(llvm::make_unique<IdenticalCodeFolding>(PrintICF),
Manager.registerPass(std::make_unique<IdenticalCodeFolding>(PrintICF),
opts::ICF);
Manager.registerPass(llvm::make_unique<PLTCall>(PrintPLT));
Manager.registerPass(std::make_unique<PLTCall>(PrintPLT));
Manager.registerPass(llvm::make_unique<ReorderBasicBlocks>(PrintReordered));
Manager.registerPass(std::make_unique<ReorderBasicBlocks>(PrintReordered));
Manager.registerPass(
llvm::make_unique<EliminateUnreachableBlocks>(PrintUCE),
std::make_unique<EliminateUnreachableBlocks>(PrintUCE),
opts::EliminateUnreachable);
Manager.registerPass(llvm::make_unique<SplitFunctions>(PrintSplit));
Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));
// This pass syncs local branches with CFG. If any of the following
// passes breaks the sync - they either need to re-run the pass or
// fix branches consistency internally.
Manager.registerPass(llvm::make_unique<FixupBranches>(PrintAfterBranchFixup));
Manager.registerPass(std::make_unique<FixupBranches>(PrintAfterBranchFixup));
// This pass should come close to last since it uses the estimated hot
// size of a function to determine the order. It should definitely
// also happen after any changes to the call graph are made, e.g. inlining.
Manager.registerPass(
llvm::make_unique<ReorderFunctions>(PrintReorderedFunctions));
std::make_unique<ReorderFunctions>(PrintReorderedFunctions));
// Print final dyno stats right while CFG and instruction analysis are intact.
Manager.registerPass(
llvm::make_unique<DynoStatsPrintPass>(
std::make_unique<DynoStatsPrintPass>(
InitialDynoStats, "after all optimizations before SCTC and FOP"),
opts::PrintDynoStats | opts::DynoStatsAll);
// Add the StokeInfo pass, which extracts functions for stoke optimization and
// gets the liveness information for them
Manager.registerPass(llvm::make_unique<StokeInfo>(PrintStoke), opts::Stoke);
Manager.registerPass(std::make_unique<StokeInfo>(PrintStoke), opts::Stoke);
// This pass introduces conditional jumps into external functions.
// Between extending CFG to support this and isolating this pass we chose
@ -476,57 +476,57 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {
// reordering so that it can tell whether calls are forward/backward
// accurately.
Manager.registerPass(
llvm::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
std::make_unique<SimplifyConditionalTailCalls>(PrintSCTC),
opts::SimplifyConditionalTailCalls);
Manager.registerPass(llvm::make_unique<Peepholes>(PrintPeepholes));
Manager.registerPass(std::make_unique<Peepholes>(PrintPeepholes));
Manager.registerPass(llvm::make_unique<AlignerPass>());
Manager.registerPass(std::make_unique<AlignerPass>());
// Perform reordering on data contained in one or more sections using
// memory profiling data.
Manager.registerPass(llvm::make_unique<ReorderData>());
Manager.registerPass(std::make_unique<ReorderData>());
// This pass should always run last.*
Manager.registerPass(llvm::make_unique<FinalizeFunctions>(PrintFinalized));
Manager.registerPass(std::make_unique<FinalizeFunctions>(PrintFinalized));
// FrameOptimizer has an implicit dependency on FinalizeFunctions.
// FrameOptimizer move values around and needs to update CFIs. To do this, it
// must read CFI, interpret it and rewrite it, so CFIs need to be correctly
// placed according to the final layout.
Manager.registerPass(llvm::make_unique<FrameOptimizerPass>(PrintFOP));
Manager.registerPass(std::make_unique<FrameOptimizerPass>(PrintFOP));
Manager.registerPass(llvm::make_unique<AllocCombinerPass>(PrintFOP));
Manager.registerPass(std::make_unique<AllocCombinerPass>(PrintFOP));
Manager.registerPass(
llvm::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
std::make_unique<RetpolineInsertion>(PrintRetpolineInsertion));
// Assign each function an output section.
Manager.registerPass(llvm::make_unique<AssignSections>());
Manager.registerPass(std::make_unique<AssignSections>());
// Patch original function entries
if (BC.HasRelocations)
Manager.registerPass(llvm::make_unique<PatchEntries>());
Manager.registerPass(std::make_unique<PatchEntries>());
// Tighten branches according to offset differences between branch and
// targets. No extra instructions after this pass, otherwise we may have
// relocations out of range and crash during linking.
if (BC.isAArch64())
Manager.registerPass(llvm::make_unique<LongJmpPass>(PrintLongJmp));
Manager.registerPass(std::make_unique<LongJmpPass>(PrintLongJmp));
// This pass turns tail calls into jumps which makes them invisible to
// function reordering. It's unsafe to use any CFG or instruction analysis
// after this point.
Manager.registerPass(
llvm::make_unique<InstructionLowering>(PrintAfterLowering));
std::make_unique<InstructionLowering>(PrintAfterLowering));
// In non-relocation mode, mark functions that do not fit into their original
// space as non-simple if we have to (e.g. for correct debug info update).
// NOTE: this pass depends on finalized code.
if (!BC.HasRelocations)
Manager.registerPass(llvm::make_unique<CheckLargeFunctions>(NeverPrint));
Manager.registerPass(std::make_unique<CheckLargeFunctions>(NeverPrint));
Manager.registerPass(llvm::make_unique<LowerAnnotations>(NeverPrint));
Manager.registerPass(std::make_unique<LowerAnnotations>(NeverPrint));
Manager.runPasses();
}
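
All of the llvm::make_unique calls above become std::make_unique now that LLVM builds as C++14; the standard-library version is a drop-in replacement. A reduced sketch of the registration pattern with illustrative stand-in types:

#include <memory>
#include <utility>
#include <vector>

struct Pass { virtual ~Pass() = default; virtual void run() = 0; };
struct PrintStats : Pass { void run() override {} };

struct PassManager {
  std::vector<std::unique_ptr<Pass>> Passes;
  void registerPass(std::unique_ptr<Pass> P, bool Enabled = true) {
    if (Enabled)
      Passes.push_back(std::move(P));
  }
};

void buildPipeline(PassManager &Manager) {
  // llvm::make_unique<PrintStats>() no longer compiles; std::make_unique
  // is the drop-in replacement.
  Manager.registerPass(std::make_unique<PrintStats>());
}
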


@ -79,17 +79,17 @@ void BinarySection::emitAsData(MCStreamer &Streamer, StringRef NewName) const {
getELFFlags());
Streamer.SwitchSection(ELFSection);
Streamer.EmitValueToAlignment(getAlignment());
Streamer.emitValueToAlignment(getAlignment());
if (BC.HasRelocations && opts::HotData && isReordered())
Streamer.EmitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_start"));
Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_start"));
DEBUG(dbgs() << "BOLT-DEBUG: emitting "
<< (isAllocatable() ? "" : "non-")
<< "allocatable data section " << SectionName << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting "
<< (isAllocatable() ? "" : "non-")
<< "allocatable data section " << SectionName << '\n');
if (!hasRelocations()) {
Streamer.EmitBytes(SectionContents);
Streamer.emitBytes(SectionContents);
} else {
uint64_t SectionOffset = 0;
for (auto &Relocation : relocations()) {
@ -98,28 +98,28 @@ void BinarySection::emitAsData(MCStreamer &Streamer, StringRef NewName) const {
if (BC.UndefinedSymbols.count(Relocation.Symbol))
continue;
if (SectionOffset < Relocation.Offset) {
Streamer.EmitBytes(
Streamer.emitBytes(
SectionContents.substr(SectionOffset,
Relocation.Offset - SectionOffset));
SectionOffset = Relocation.Offset;
}
DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol "
<< (Relocation.Symbol ? Relocation.Symbol->getName()
: StringRef("<none>"))
<< " at offset 0x" << Twine::utohexstr(Relocation.Offset)
<< " with size "
<< Relocation::getSizeForType(Relocation.Type) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: emitting relocation for symbol "
<< (Relocation.Symbol ? Relocation.Symbol->getName()
: StringRef("<none>"))
<< " at offset 0x"
<< Twine::utohexstr(Relocation.Offset) << " with size "
<< Relocation::getSizeForType(Relocation.Type) << '\n');
auto RelocationSize = Relocation.emit(&Streamer);
SectionOffset += RelocationSize;
}
assert(SectionOffset <= SectionContents.size() && "overflow error");
if (SectionOffset < SectionContents.size()) {
Streamer.EmitBytes(SectionContents.substr(SectionOffset));
Streamer.emitBytes(SectionContents.substr(SectionOffset));
}
}
if (BC.HasRelocations && opts::HotData && isReordered())
Streamer.EmitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_end"));
Streamer.emitLabel(BC.Ctx->getOrCreateSymbol("__hot_data_end"));
}
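
MCStreamer's emission entry points were renamed from EmitXxx to emitXxx; only the casing changed. A sketch of the renamed calls used above, assuming the caller already holds a configured streamer and context:

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"

// Formerly EmitLabel / EmitValueToAlignment / EmitBytes.
static void emitBlob(llvm::MCStreamer &Streamer, llvm::MCContext &Ctx,
                     llvm::StringRef Bytes) {
  Streamer.emitLabel(Ctx.getOrCreateSymbol("__hot_data_start"));
  Streamer.emitValueToAlignment(8); // pad to an 8-byte boundary
  Streamer.emitBytes(Bytes);
  Streamer.emitLabel(Ctx.getOrCreateSymbol("__hot_data_end"));
}
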
void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS,
@ -135,11 +135,11 @@ void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS,
// sections, the output offset should always be a valid one.
const uint64_t SectionFileOffset = isAllocatable() ? getInputFileOffset()
: getOutputFileOffset();
DEBUG(dbgs() << "BOLT-DEBUG: flushing pending relocations for section "
<< getName() << '\n'
<< " address: 0x" << Twine::utohexstr(SectionAddress) << '\n'
<< " offset: 0x" << Twine::utohexstr(SectionFileOffset) << '\n'
);
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: flushing pending relocations for section "
<< getName() << '\n'
<< " address: 0x" << Twine::utohexstr(SectionAddress) << '\n'
<< " offset: 0x" << Twine::utohexstr(SectionFileOffset) << '\n');
for (auto &Patch : Patches) {
OS.pwrite(Patch.Bytes.data(),
@ -167,31 +167,26 @@ void BinarySection::flushPendingRelocations(raw_pwrite_stream &OS,
OS.pwrite(reinterpret_cast<const char*>(&Value),
Relocation::getSizeForType(Reloc.Type),
SectionFileOffset + Reloc.Offset);
DEBUG(
dbgs() << "BOLT-DEBUG: writing value 0x"
<< Twine::utohexstr(Value)
<< " of size " << Relocation::getSizeForType(Reloc.Type)
<< " at offset 0x"
<< Twine::utohexstr(Reloc.Offset)
<< " address 0x"
<< Twine::utohexstr(SectionAddress + Reloc.Offset)
<< " Offset 0x"
<< Twine::utohexstr(SectionFileOffset + Reloc.Offset)
<< '\n';
);
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: writing value 0x"
<< Twine::utohexstr(Value) << " of size "
<< Relocation::getSizeForType(Reloc.Type)
<< " at offset 0x" << Twine::utohexstr(Reloc.Offset)
<< " address 0x"
<< Twine::utohexstr(SectionAddress + Reloc.Offset)
<< " Offset 0x"
<< Twine::utohexstr(SectionFileOffset + Reloc.Offset)
<< '\n';);
break;
}
}
DEBUG(dbgs() << "BOLT-DEBUG: writing value 0x"
<< Twine::utohexstr(Value)
<< " of size " << Relocation::getSizeForType(Reloc.Type)
<< " at section offset 0x"
<< Twine::utohexstr(Reloc.Offset)
<< " address 0x"
<< Twine::utohexstr(SectionAddress + Reloc.Offset)
<< " file offset 0x"
<< Twine::utohexstr(SectionFileOffset + Reloc.Offset)
<< '\n';);
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: writing value 0x" << Twine::utohexstr(Value)
<< " of size " << Relocation::getSizeForType(Reloc.Type)
<< " at section offset 0x" << Twine::utohexstr(Reloc.Offset)
<< " address 0x"
<< Twine::utohexstr(SectionAddress + Reloc.Offset)
<< " file offset 0x"
<< Twine::utohexstr(SectionFileOffset + Reloc.Offset) << '\n';);
}
clearList(PendingRelocations);
@ -255,7 +250,8 @@ std::set<Relocation> BinarySection::reorderRelocations(bool Inplace) const {
auto RelOffset = RelAddr - BD->getAddress();
NewRel.Offset = BD->getOutputOffset() + RelOffset;
assert(NewRel.Offset < getSize());
DEBUG(dbgs() << "BOLT-DEBUG: moving " << Rel << " -> " << NewRel << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: moving " << Rel << " -> " << NewRel
<< "\n");
auto Res = NewRelocations.emplace(std::move(NewRel));
(void)Res;
assert(Res.second && "Can't overwrite existing relocation");
@ -272,7 +268,7 @@ void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
std::string Str;
raw_string_ostream OS(Str);
auto *Src = Contents.data();
DEBUG(dbgs() << "BOLT-DEBUG: reorderContents for " << Name << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: reorderContents for " << Name << "\n");
for (auto *BD : Order) {
assert((BD->isMoved() || !Inplace) && !BD->isJumpTable());
assert(BD->isAtomic() && BD->isMoveable());
@ -282,8 +278,8 @@ void BinarySection::reorderContents(const std::vector<BinaryData *> &Order,
while (OS.tell() < BD->getOutputOffset()) {
OS.write((unsigned char)0);
}
DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName()
<< " @ " << OS.tell() << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() << " @ " << OS.tell()
<< "\n");
OS.write(&Src[SrcOffset], BD->getOutputSize());
}
if (Relocations.empty()) {


@ -97,22 +97,21 @@ class BinarySection {
BinarySection &operator=(BinarySection &&) = delete;
static StringRef getName(SectionRef Section) {
StringRef Name;
Section.getName(Name);
return Name;
return cantFail(Section.getName());
}
static StringRef getContents(SectionRef Section) {
StringRef Contents;
if (Section.getObject()->isELF() &&
ELFSectionRef(Section).getType() == ELF::SHT_NOBITS)
return Contents;
return StringRef();
if (auto EC = Section.getContents(Contents)) {
auto ContentsOrErr = Section.getContents();
if (!ContentsOrErr) {
auto E = ContentsOrErr.takeError();
errs() << "BOLT-ERROR: cannot get section contents for "
<< getName(Section) << ": " << EC.message() << ".\n";
<< getName(Section) << ": " << E << ".\n";
exit(1);
}
return Contents;
return *ContentsOrErr;
}
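
SectionRef::getName() and getContents() moved from error_code out-parameters to Expected<StringRef> return values, which is what the new helpers unwrap. A hedged sketch of iterating sections with the Expected-based accessors:

#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::object;

static void dumpSectionSizes(const ObjectFile &Obj) {
  for (const SectionRef &Section : Obj.sections()) {
    Expected<StringRef> NameOrErr = Section.getName();
    if (!NameOrErr) {
      consumeError(NameOrErr.takeError()); // skip unnamed/broken sections
      continue;
    }
    Expected<StringRef> ContentsOrErr = Section.getContents();
    if (!ContentsOrErr) {
      errs() << "cannot get contents for " << *NameOrErr << ": "
             << toString(ContentsOrErr.takeError()) << '\n';
      continue;
    }
    outs() << *NameOrErr << ": " << ContentsOrErr->size() << " bytes\n";
  }
}
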
/// Get the set of relocations referring to data in this section that
@ -453,7 +452,7 @@ public:
Index = I;
}
void setOutputName(StringRef Name) {
OutputName = Name;
OutputName = std::string(Name);
}
void setAnonymous(bool Flag) {
IsAnonymous = Flag;


@ -31,9 +31,9 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
"Every output BB must track back to an input BB for profile "
"collection in bolted binaries");
DEBUG(dbgs() << "BB " << BB.getName() <<"\n");
DEBUG(dbgs() << " Key: " << Twine::utohexstr(BBOutputOffset)
<< " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
LLVM_DEBUG(dbgs() << "BB " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << " Key: " << Twine::utohexstr(BBOutputOffset)
<< " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
// In case of conflicts (same Key mapping to different Vals), the last
// update takes precedence. Of course it is not ideal to have conflicts and
// those happen when we have an empty BB that either contained only
@ -51,16 +51,15 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
if (OutputOffset == BBOutputOffset)
continue;
DEBUG(dbgs() << " Key: " << Twine::utohexstr(OutputOffset)
<< " Val: " << Twine::utohexstr(InputOffset)
<< " (branch)\n");
LLVM_DEBUG(dbgs() << " Key: " << Twine::utohexstr(OutputOffset) << " Val: "
<< Twine::utohexstr(InputOffset) << " (branch)\n");
Map.insert(
std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset | BRANCHENTRY));
}
}
void BoltAddressTranslation::write(raw_ostream &OS) {
DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
for (auto &BFI : BC.getBinaryFunctions()) {
auto &Function = BFI.second;
// We don't need a translation table if the body of the function hasn't
@ -68,9 +67,9 @@ void BoltAddressTranslation::write(raw_ostream &OS) {
if (!BC.HasRelocations && !Function.isSimple())
continue;
DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
DEBUG(dbgs() << " Address reference: 0x"
<< Twine::utohexstr(Function.getOutputAddress()) << "\n");
LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
LLVM_DEBUG(dbgs() << " Address reference: 0x"
<< Twine::utohexstr(Function.getOutputAddress()) << "\n");
MapTy Map;
const bool IsSplit = Function.isSplit();
for (const auto &BB : Function.layout()) {
@ -85,7 +84,7 @@ void BoltAddressTranslation::write(raw_ostream &OS) {
// Cold map
Map.clear();
DEBUG(dbgs() << " Cold part\n");
LLVM_DEBUG(dbgs() << " Cold part\n");
for (const auto &BB : Function.layout()) {
if (!BB->isCold())
continue;
@ -98,13 +97,13 @@ void BoltAddressTranslation::write(raw_ostream &OS) {
const uint32_t NumFuncs = Maps.size();
OS.write(reinterpret_cast<const char *>(&NumFuncs), 4);
DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
LLVM_DEBUG(dbgs() << "Writing " << NumFuncs << " functions for BAT.\n");
for (auto &MapEntry : Maps) {
const uint64_t Address = MapEntry.first;
MapTy &Map = MapEntry.second;
const uint32_t NumEntries = Map.size();
DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
<< Twine::utohexstr(Address) << ".\n");
LLVM_DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
<< Twine::utohexstr(Address) << ".\n");
OS.write(reinterpret_cast<const char *>(&Address), 8);
OS.write(reinterpret_cast<const char *>(&NumEntries), 4);
for (auto &KeyVal : Map) {
@ -113,13 +112,14 @@ void BoltAddressTranslation::write(raw_ostream &OS) {
}
}
const uint32_t NumColdEntries = ColdPartSource.size();
DEBUG(dbgs() << "Writing " << NumColdEntries << " cold part mappings.\n");
LLVM_DEBUG(dbgs() << "Writing " << NumColdEntries
<< " cold part mappings.\n");
OS.write(reinterpret_cast<const char *>(&NumColdEntries), 4);
for (auto &ColdEntry : ColdPartSource) {
OS.write(reinterpret_cast<const char *>(&ColdEntry.first), 8);
OS.write(reinterpret_cast<const char *>(&ColdEntry.second), 8);
DEBUG(dbgs() << " " << Twine::utohexstr(ColdEntry.first) << " -> "
<< Twine::utohexstr(ColdEntry.second) << "\n");
LLVM_DEBUG(dbgs() << " " << Twine::utohexstr(ColdEntry.first) << " -> "
<< Twine::utohexstr(ColdEntry.second) << "\n");
}
outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
@ -129,7 +129,7 @@ void BoltAddressTranslation::write(raw_ostream &OS) {
std::error_code BoltAddressTranslation::parse(StringRef Buf) {
DataExtractor DE = DataExtractor(Buf, true, 8);
uint32_t Offset = 0;
uint64_t Offset = 0;
if (Buf.size() < 12)
return make_error_code(llvm::errc::io_error);
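
DataExtractor offsets are now uint64_t, which is why the parser's Offset variable changes type above. A minimal sketch of reading the same kind of little-endian header prefix (the field layout mirrors the BAT header only loosely):

#include "llvm/Support/DataExtractor.h"

using namespace llvm;

// Read a 64-bit address followed by a 32-bit entry count.
static bool readHeader(StringRef Buf, uint64_t &Address, uint32_t &NumEntries) {
  if (Buf.size() < 12)
    return false;
  DataExtractor DE(Buf, /*IsLittleEndian=*/true, /*AddressSize=*/8);
  uint64_t Offset = 0; // a uint32_t cursor no longer matches the API
  Address = DE.getU64(&Offset);
  NumEntries = DE.getU32(&Offset);
  return true;
}
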
@ -150,7 +150,7 @@ std::error_code BoltAddressTranslation::parse(StringRef Buf) {
return make_error_code(llvm::errc::io_error);
const uint32_t NumFunctions = DE.getU32(&Offset);
DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
LLVM_DEBUG(dbgs() << "Parsing " << NumFunctions << " functions\n");
for (uint32_t I = 0; I < NumFunctions; ++I) {
if (Buf.size() - Offset < 12)
return make_error_code(llvm::errc::io_error);
@ -159,16 +159,16 @@ std::error_code BoltAddressTranslation::parse(StringRef Buf) {
const uint32_t NumEntries = DE.getU32(&Offset);
MapTy Map;
DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
<< Twine::utohexstr(Address) << "\n");
LLVM_DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
<< Twine::utohexstr(Address) << "\n");
if (Buf.size() - Offset < 8 * NumEntries)
return make_error_code(llvm::errc::io_error);
for (uint32_t J = 0; J < NumEntries; ++J) {
const uint32_t OutputAddr = DE.getU32(&Offset);
const uint32_t InputAddr = DE.getU32(&Offset);
Map.insert(std::pair<uint32_t, uint32_t>(OutputAddr, InputAddr));
DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
<< Twine::utohexstr(InputAddr) << "\n");
LLVM_DEBUG(dbgs() << Twine::utohexstr(OutputAddr) << " -> "
<< Twine::utohexstr(InputAddr) << "\n");
}
Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
}
@ -177,7 +177,7 @@ std::error_code BoltAddressTranslation::parse(StringRef Buf) {
return make_error_code(llvm::errc::io_error);
const uint32_t NumColdEntries = DE.getU32(&Offset);
DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
LLVM_DEBUG(dbgs() << "Parsing " << NumColdEntries << " cold part mappings\n");
for (uint32_t I = 0; I < NumColdEntries; ++I) {
if (Buf.size() - Offset < 16)
return make_error_code(llvm::errc::io_error);
@ -185,8 +185,8 @@ std::error_code BoltAddressTranslation::parse(StringRef Buf) {
const uint32_t HotAddress = DE.getU64(&Offset);
ColdPartSource.insert(
std::pair<uint64_t, uint64_t>(ColdAddress, HotAddress));
DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
<< Twine::utohexstr(HotAddress) << "\n");
LLVM_DEBUG(dbgs() << Twine::utohexstr(ColdAddress) << " -> "
<< Twine::utohexstr(HotAddress) << "\n");
}
outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
outs() << "BOLT-INFO: Parsed " << NumColdEntries
@ -283,11 +283,11 @@ uint64_t BoltAddressTranslation::fetchParentAddress(uint64_t Address) const {
bool BoltAddressTranslation::enabledFor(
llvm::object::ELFObjectFileBase *InputFile) const {
for (const auto &Section : InputFile->sections()) {
StringRef SectionName;
if (std::error_code EC = Section.getName(SectionName))
auto SectionNameOrErr = Section.getName();
if (auto E = SectionNameOrErr.takeError())
continue;
if (SectionName == SECTION_NAME)
if (SectionNameOrErr.get() == SECTION_NAME)
return true;
}
return false;


@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "CacheMetrics.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
using namespace bolt;
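
llvm/Support/Options.h is gone; command-line declarations come from CommandLine.h directly. A sketch of the kind of flag such a file declares (the option name below is made up for illustration):

#include "llvm/Support/CommandLine.h"

namespace opts {
// Hypothetical flag, shown only to illustrate the CommandLine.h API.
static llvm::cl::opt<bool>
    PrintCacheMetricsExample("print-cache-metrics-example",
                             llvm::cl::desc("print i-cache locality metrics"),
                             llvm::cl::init(false));
} // namespace opts
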


@ -67,8 +67,8 @@ DeterministicDebugInfo("deterministic-debuginfo",
} // namespace opts
void DWARFRewriter::updateDebugInfo() {
SectionPatchers[".debug_abbrev"] = llvm::make_unique<DebugAbbrevPatcher>();
SectionPatchers[".debug_info"] = llvm::make_unique<SimpleBinaryPatcher>();
SectionPatchers[".debug_abbrev"] = std::make_unique<DebugAbbrevPatcher>();
SectionPatchers[".debug_info"] = std::make_unique<SimpleBinaryPatcher>();
DebugInfoPatcher =
static_cast<SimpleBinaryPatcher *>(SectionPatchers[".debug_info"].get());
@ -76,8 +76,8 @@ void DWARFRewriter::updateDebugInfo() {
static_cast<DebugAbbrevPatcher *>(SectionPatchers[".debug_abbrev"].get());
assert(DebugInfoPatcher && AbbrevPatcher && "Patchers not initialized.");
ARangesSectionWriter = llvm::make_unique<DebugARangesSectionWriter>();
RangesSectionWriter = llvm::make_unique<DebugRangesSectionWriter>(&BC);
ARangesSectionWriter = std::make_unique<DebugARangesSectionWriter>();
RangesSectionWriter = std::make_unique<DebugRangesSectionWriter>(&BC);
size_t NumCUs = BC.DwCtx->getNumCompileUnits();
if (opts::NoThreads || opts::DeterministicDebugInfo) {
@ -88,7 +88,7 @@ void DWARFRewriter::updateDebugInfo() {
LocListWritersByCU.resize(NumCUs);
for (size_t CUIndex = 0; CUIndex < NumCUs; ++CUIndex) {
LocListWritersByCU[CUIndex] = llvm::make_unique<DebugLocWriter>(&BC);
LocListWritersByCU[CUIndex] = std::make_unique<DebugLocWriter>(&BC);
}
auto processUnitDIE = [&](size_t CUIndex, DWARFUnit *Unit) {
@ -122,9 +122,8 @@ void DWARFRewriter::updateUnitDebugInfo(size_t CUIndex, DWARFUnit *Unit) {
// Cache debug ranges so that the offset for identical ranges could be reused.
std::map<DebugAddressRangesVector, uint64_t> CachedRanges;
const uint32_t HeaderSize = Unit->getVersion() <= 4 ? 11 : 12;
uint32_t DIEOffset = Unit->getOffset() + HeaderSize;
uint32_t NextCUOffset = Unit->getNextUnitOffset();
uint64_t DIEOffset = Unit->getOffset() + Unit->getHeaderSize();
uint64_t NextCUOffset = Unit->getNextUnitOffset();
DWARFDebugInfoEntry Die;
DWARFDataExtractor DebugInfoData = Unit->getDebugInfoExtractor();
uint32_t Depth = 0;
@ -145,9 +144,10 @@ void DWARFRewriter::updateUnitDebugInfo(size_t CUIndex, DWARFUnit *Unit) {
DWARFDie DIE(Unit, &Die);
switch (DIE.getTag()) {
case dwarf::DW_TAG_compile_unit: {
const DWARFAddressRangesVector ModuleRanges = DIE.getAddressRanges();
const DWARFAddressRangesVector ModuleRanges =
cantFail(DIE.getAddressRanges());
DebugAddressRangesVector OutputRanges =
BC.translateModuleAddressRanges(ModuleRanges);
BC.translateModuleAddressRanges(ModuleRanges);
const uint64_t RangesSectionOffset =
RangesSectionWriter->addRanges(OutputRanges);
ARangesSectionWriter->addCURanges(Unit->getOffset(),
@ -161,7 +161,7 @@ void DWARFRewriter::updateUnitDebugInfo(size_t CUIndex, DWARFUnit *Unit) {
uint64_t Address;
uint64_t SectionIndex, HighPC;
if (!DIE.getLowAndHighPC(Address, HighPC, SectionIndex)) {
auto Ranges = DIE.getAddressRanges();
auto Ranges = cantFail(DIE.getAddressRanges());
// Not a function definition.
if (Ranges.empty())
break;
@ -216,14 +216,15 @@ void DWARFRewriter::updateUnitDebugInfo(size_t CUIndex, DWARFUnit *Unit) {
case dwarf::DW_TAG_catch_block: {
uint64_t RangesSectionOffset =
RangesSectionWriter->getEmptyRangesOffset();
const DWARFAddressRangesVector Ranges = DIE.getAddressRanges();
const BinaryFunction *Function = Ranges.empty() ? nullptr :
BC.getBinaryFunctionContainingAddress(Ranges.front().LowPC);
Expected<DWARFAddressRangesVector> RangesOrError = DIE.getAddressRanges();
const BinaryFunction *Function = RangesOrError && !RangesOrError->empty()
? BC.getBinaryFunctionContainingAddress(RangesOrError->front().LowPC)
: nullptr;
if (Function) {
DebugAddressRangesVector OutputRanges =
Function->translateInputToOutputRanges(Ranges);
DEBUG(
if (OutputRanges.empty() != Ranges.empty()) {
Function->translateInputToOutputRanges(*RangesOrError);
LLVM_DEBUG(
if (OutputRanges.empty() != RangesOrError->empty()) {
dbgs() << "BOLT-DEBUG: problem with DIE at 0x"
<< Twine::utohexstr(DIE.getOffset()) << " in CU at 0x"
<< Twine::utohexstr(Unit->getOffset())
@ -239,52 +240,67 @@ void DWARFRewriter::updateUnitDebugInfo(size_t CUIndex, DWARFUnit *Unit) {
default: {
// Handle any tag that can have DW_AT_location attribute.
DWARFFormValue Value;
uint32_t AttrOffset;
uint64_t AttrOffset;
if (auto V = DIE.find(dwarf::DW_AT_location, &AttrOffset)) {
Value = *V;
if (Value.isFormClass(DWARFFormValue::FC_Constant) ||
Value.isFormClass(DWARFFormValue::FC_SectionOffset)) {
// Location list offset in the output section.
uint64_t LocListOffset = DebugLocWriter::EmptyListTag;
// Limit parsing to a single list to save memory.
DWARFDebugLoc::LocationList LL;
LL.Offset = Value.isFormClass(DWARFFormValue::FC_Constant) ?
Value.getAsUnsignedConstant().getValue() :
Value.getAsSectionOffset().getValue();
uint32_t LLOff = LL.Offset;
Optional<DWARFDebugLoc::LocationList> InputLL =
Unit->getContext().getOneDebugLocList(
&LLOff, Unit->getBaseAddress()->Address);
if (!InputLL || InputLL->Entries.empty()) {
uint64_t Offset = Value.isFormClass(DWARFFormValue::FC_Constant)
? Value.getAsUnsignedConstant().getValue()
: Value.getAsSectionOffset().getValue();
DebugLocationsVector InputLL;
uint64_t BaseAddress = Unit->getBaseAddress()->Address;
Error E = Unit->getLocationTable().visitLocationList(
&Offset,
[&](const DWARFLocationEntry &Entry) {
switch (Entry.Kind) {
case dwarf::DW_LLE_end_of_list:
return false;
case dwarf::DW_LLE_base_address:
assert(Entry.SectionIndex == SectionedAddress::UndefSection &&
"absolute address expected");
BaseAddress = Entry.Value0;
break;
case dwarf::DW_LLE_offset_pair:
assert(Entry.SectionIndex == SectionedAddress::UndefSection &&
"absolute address expected");
InputLL.emplace_back(DebugLocationEntry{
BaseAddress + Entry.Value0,
BaseAddress + Entry.Value1,
Entry.Loc});
break;
}
return true;
});
uint64_t OutputLocListOffset = DebugLocWriter::EmptyListTag;
if (E || InputLL.empty()) {
errs() << "BOLT-WARNING: empty location list detected at 0x"
<< Twine::utohexstr(LLOff) << " for DIE at 0x"
<< Twine::utohexstr(Offset) << " for DIE at 0x"
<< Twine::utohexstr(DIE.getOffset()) << " in CU at 0x"
<< Twine::utohexstr(Unit->getOffset())
<< '\n';
} else {
const uint64_t Address = InputLL.front().LowPC;
if (const BinaryFunction *Function =
BC.getBinaryFunctionContainingAddress(
InputLL->Entries.front().Begin)) {
const DWARFDebugLoc::LocationList OutputLL =
Function->translateInputToOutputLocationList(
std::move(*InputLL));
DEBUG(if (OutputLL.Entries.empty()) {
BC.getBinaryFunctionContainingAddress(Address)) {
const DebugLocationsVector OutputLL = Function
->translateInputToOutputLocationList(InputLL);
LLVM_DEBUG(if (OutputLL.empty()) {
dbgs() << "BOLT-DEBUG: location list translated to an empty "
"one at 0x"
<< Twine::utohexstr(DIE.getOffset()) << " in CU at 0x"
<< Twine::utohexstr(Unit->getOffset())
<< '\n';
});
LocListOffset = LocListWritersByCU[CUIndex]->addList(OutputLL);
OutputLocListOffset =
LocListWritersByCU[CUIndex]->addList(OutputLL);
}
}
if (LocListOffset != DebugLocWriter::EmptyListTag) {
if (OutputLocListOffset != DebugLocWriter::EmptyListTag) {
std::lock_guard<std::mutex> Lock(LocListDebugInfoPatchesMutex);
LocListDebugInfoPatches.push_back(
{AttrOffset, CUIndex, LocListOffset});
{AttrOffset, CUIndex, OutputLocListOffset});
} else {
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
DebugInfoPatcher->addLE32Patch(AttrOffset,
@ -304,10 +320,11 @@ void DWARFRewriter::updateUnitDebugInfo(size_t CUIndex, DWARFUnit *Unit) {
if (const BinaryFunction *Function =
BC.getBinaryFunctionContainingAddress(Address)) {
NewAddress = Function->translateInputToOutputAddress(Address);
DEBUG(dbgs() << "BOLT-DEBUG: Fixing low_pc 0x"
<< Twine::utohexstr(Address)
<< " for DIE with tag " << DIE.getTag()
<< " to 0x" << Twine::utohexstr(NewAddress) << '\n');
LLVM_DEBUG(dbgs()
<< "BOLT-DEBUG: Fixing low_pc 0x"
<< Twine::utohexstr(Address) << " for DIE with tag "
<< DIE.getTag() << " to 0x"
<< Twine::utohexstr(NewAddress) << '\n');
}
std::lock_guard<std::mutex> Lock(DebugInfoPatcherMutex);
@ -349,7 +366,7 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
if (AbbreviationDecl->findAttributeIndex(dwarf::DW_AT_ranges)) {
// Case 1: The object was already non-contiguous and had DW_AT_ranges.
// In this case we simply need to update the value of DW_AT_ranges.
uint32_t AttrOffset = -1U;
uint64_t AttrOffset = -1U;
DIE.find(dwarf::DW_AT_ranges, &AttrOffset);
assert(AttrOffset != -1U && "failed to locate DWARF attribute");
@ -382,33 +399,28 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
}
void DWARFRewriter::updateLineTableOffsets() {
const auto *LineSection =
const MCSection *LineSection =
BC.Ctx->getObjectFileInfo()->getDwarfLineSection();
auto CurrentFragment = LineSection->begin();
uint32_t CurrentOffset = 0;
uint32_t Offset = 0;
uint64_t CurrentOffset = 0;
uint64_t Offset = 0;
// Line tables are stored in MCContext in ascending order of offset in the
// output file, thus we can compute all tables' offsets by passing through
// each fragment at most once, continuing from the last CU's beginning
// instead of from the first fragment.
for (const auto &CUIDLineTablePair : BC.Ctx->getMCDwarfLineTables()) {
auto Label = CUIDLineTablePair.second.getLabel();
for (const auto &CU : BC.DwCtx->compile_units()) {
const unsigned CUID = CU->getOffset();
MCSymbol *Label = BC.Ctx->getMCDwarfLineTable(CUID).getLabel();
if (!Label)
continue;
auto CUOffset = CUIDLineTablePair.first;
if (CUOffset == -1U)
continue;
auto *CU = BC.DwCtx->getCompileUnitForOffset(CUOffset);
assert(CU && "no CU found at offset");
auto LTOffset =
BC.DwCtx->getAttrFieldOffsetForUnit(CU, dwarf::DW_AT_stmt_list);
const uint64_t LTOffset =
BC.DwCtx->getAttrFieldOffsetForUnit(CU.get(), dwarf::DW_AT_stmt_list);
if (!LTOffset)
continue;
auto Fragment = Label->getFragment();
// Line tables are stored in MCContext in ascending order of offset in the
// output file, thus we can compute all tables' offsets by passing through
// each fragment at most once, continuing from the last CU's beginning
// instead of from the first fragment.
MCFragment *Fragment = Label->getFragment();
while (&*CurrentFragment != Fragment) {
switch (CurrentFragment->getKind()) {
case MCFragment::FT_Dwarf:
@ -443,8 +455,8 @@ void DWARFRewriter::updateLineTableOffsets() {
// that the pending relocations will be processed and not ignored.
DbgInfoSection->setIsFinalized();
DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUIDLineTablePair.first
<< " has line table at " << Offset << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: CU " << CUID
<< " has line table at " << Offset << "\n");
}
}
@ -456,9 +468,8 @@ void DWARFRewriter::finalizeDebugSections() {
auto MAB = std::unique_ptr<MCAsmBackend>(BC.TheTarget->createMCAsmBackend(
*BC.STI, *BC.MRI, MCTargetOptions()));
auto Writer = std::unique_ptr<MCObjectWriter>(MAB->createObjectWriter(OS));
ARangesSectionWriter->writeARangesSection(Writer.get());
ARangesSectionWriter->writeARangesSection(OS);
const auto &ARangesContents = OS.str();
BC.registerOrUpdateNoteSection(".debug_aranges",
@ -514,7 +525,7 @@ void DWARFRewriter::updateGdbIndexSection() {
exit(1);
}
for (unsigned Index = 0; Index < NumCUs; ++Index, Data += 16) {
const auto *CU = BC.DwCtx->getCompileUnitAtIndex(Index);
const auto *CU = BC.DwCtx->getUnitAtIndex(Index);
const auto Offset = read64le(Data);
if (CU->getOffset() != Offset) {
errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n";
@ -641,24 +652,24 @@ void DWARFRewriter::convertPending(const DWARFAbbreviationDeclaration *Abbrev) {
}
std::unique_ptr<LocBufferVector> DWARFRewriter::makeFinalLocListsSection() {
auto LocBuffer = llvm::make_unique<LocBufferVector>();
auto LocStream = llvm::make_unique<raw_svector_ostream>(*LocBuffer);
auto LocBuffer = std::make_unique<LocBufferVector>();
auto LocStream = std::make_unique<raw_svector_ostream>(*LocBuffer);
auto Writer =
std::unique_ptr<MCObjectWriter>(BC.createObjectWriter(*LocStream));
uint32_t SectionOffset = 0;
uint64_t SectionOffset = 0;
// Add an empty list as the first entry;
Writer->writeLE64(0);
Writer->writeLE64(0);
const char Zeroes[16] = {0};
*LocStream << StringRef(Zeroes, 16);
SectionOffset += 2 * 8;
std::vector<uint32_t> SectionOffsetByCU(LocListWritersByCU.size());
std::vector<uint64_t> SectionOffsetByCU(LocListWritersByCU.size());
for (size_t CUIndex = 0; CUIndex < LocListWritersByCU.size(); ++CUIndex) {
SectionOffsetByCU[CUIndex] = SectionOffset;
auto CurrCULocationLists = LocListWritersByCU[CUIndex]->finalize();
Writer->writeBytes(*CurrCULocationLists);
*LocStream << *CurrCULocationLists;
SectionOffset += CurrCULocationLists->size();
}
@ -684,7 +695,7 @@ namespace {
void getRangeAttrData(
DWARFDie DIE,
uint32_t &LowPCOffset, uint32_t &HighPCOffset,
uint64_t &LowPCOffset, uint64_t &HighPCOffset,
DWARFFormValue &LowPCFormValue, DWARFFormValue &HighPCFormValue) {
LowPCOffset = -1U;
HighPCOffset = -1U;
@ -710,7 +721,7 @@ void getRangeAttrData(
}
void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range) {
uint32_t LowPCOffset, HighPCOffset;
uint64_t LowPCOffset, HighPCOffset;
DWARFFormValue LowPCFormValue, HighPCFormValue;
getRangeAttrData(
DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue);
@ -725,7 +736,7 @@ void DWARFRewriter::patchLowHigh(DWARFDie DIE, DebugAddressRange Range) {
void DWARFRewriter::convertToRanges(DWARFDie DIE,
uint64_t RangesSectionOffset) {
uint32_t LowPCOffset, HighPCOffset;
uint64_t LowPCOffset, HighPCOffset;
DWARFFormValue LowPCFormValue, HighPCFormValue;
getRangeAttrData(
DIE, LowPCOffset, HighPCOffset, LowPCFormValue, HighPCFormValue);
@ -752,4 +763,3 @@ void DWARFRewriter::convertToRanges(DWARFDie DIE,
DebugInfoPatcher->addLE64Patch(LowPCOffset + 4, 0);
}
}
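
Earlier in this file, the removed getOneDebugLocList call is replaced by the visitor-based DWARFLocationTable API: visitLocationList hands the callback raw DWARFLocationEntry records and stops when the callback returns false. A reduced sketch of that protocol, mirroring the DW_LLE handling in the hunk above (error plumbing and the remaining DW_LLE kinds omitted):

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
#include "llvm/Support/Error.h"
#include <utility>
#include <vector>

using namespace llvm;

// Collect absolute [LowPC, HighPC) pairs from one location list.
static Error collectRanges(DWARFUnit &Unit, uint64_t Offset,
                           uint64_t BaseAddress,
                           std::vector<std::pair<uint64_t, uint64_t>> &Out) {
  return Unit.getLocationTable().visitLocationList(
      &Offset, [&](const DWARFLocationEntry &Entry) {
        switch (Entry.Kind) {
        case dwarf::DW_LLE_end_of_list:
          return false; // stop iteration
        case dwarf::DW_LLE_base_address:
          BaseAddress = Entry.Value0; // subsequent pairs are relative to this
          break;
        case dwarf::DW_LLE_offset_pair:
          Out.emplace_back(BaseAddress + Entry.Value0,
                           BaseAddress + Entry.Value1);
          break;
        default:
          break;
        }
        return true; // keep visiting
      });
}
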


@ -53,7 +53,7 @@ class DWARFRewriter {
std::vector<std::unique_ptr<DebugLocWriter>> LocListWritersByCU;
struct LocListDebugInfoPatchType {
uint32_t DebugInfoOffset;
uint64_t DebugInfoOffset;
size_t CUIndex;
uint64_t CUWriterOffset;
};


@ -19,9 +19,9 @@
#include "ExecutableFileMemoryManager.h"
#include "Heatmap.h"
#include "Utils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
@ -243,7 +243,7 @@ void DataAggregator::abort() {
void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
const char *ArgsString, bool Wait) {
SmallVector<const char*, 4> Argv;
SmallVector<StringRef, 4> Argv;
outs() << "PERF2BOLT: spawning perf job to read " << Name << '\n';
Argv.push_back(PerfPath.data());
@ -262,7 +262,6 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
Argv.push_back("-f");
Argv.push_back("-i");
Argv.push_back(Filename.c_str());
Argv.push_back(nullptr);
if (auto Errc = sys::fs::createTemporaryFile("perf.script", "out",
PPI.StdoutPath)) {
@ -286,21 +285,19 @@ void DataAggregator::launchPerfProcess(StringRef Name, PerfProcessInfo &PPI,
StringRef(PPI.StdoutPath.data()), // Stdout
StringRef(PPI.StderrPath.data())}; // Stderr
DEBUG({
dbgs() << "Launching perf: ";
for (const char *Arg : Argv)
dbgs() << Arg << " ";
dbgs() << " 1> "
<< PPI.StdoutPath.data() << " 2> "
<< PPI.StderrPath.data() << "\n";
});
LLVM_DEBUG({
dbgs() << "Launching perf: ";
for (StringRef Arg : Argv)
dbgs() << Arg << " ";
dbgs() << " 1> " << PPI.StdoutPath.data() << " 2> " << PPI.StderrPath.data()
<< "\n";
});
if (Wait) {
PPI.PI.ReturnCode =
sys::ExecuteAndWait(PerfPath.data(), Argv.data(), /*envp*/ nullptr,
Redirects);
PPI.PI.ReturnCode = sys::ExecuteAndWait(PerfPath.data(), Argv,
/*envp*/ llvm::None, Redirects);
} else {
PPI.PI = sys::ExecuteNoWait(PerfPath.data(), Argv.data(), /*envp*/ nullptr,
PPI.PI = sys::ExecuteNoWait(PerfPath.data(), Argv, /*envp*/ llvm::None,
Redirects);
}
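
sys::ExecuteAndWait and ExecuteNoWait now take ArrayRef<StringRef> for the argument vector (no trailing nullptr) and Optional<ArrayRef<StringRef>> for the environment, matching the Argv changes above. A sketch:

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Program.h"

using namespace llvm;

// Run `perf script -f -i <input>` and wait. Argv[0] is the program itself.
static int runPerfScript(StringRef PerfPath, StringRef InputPath) {
  SmallVector<StringRef, 8> Argv = {PerfPath, "script", "-f", "-i", InputPath};
  Optional<ArrayRef<StringRef>> Env = None; // inherit the environment
  return sys::ExecuteAndWait(PerfPath, Argv, Env);
}
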
@ -355,7 +352,7 @@ void DataAggregator::processFileBuildID(StringRef FileBuildID) {
}
} else if (*FileName != llvm::sys::path::filename(BC->getFilename())) {
errs() << "PERF2BOLT-WARNING: build-id matched a different file name\n";
BuildIDBinaryName = *FileName;
BuildIDBinaryName = std::string(*FileName);
} else {
outs() << "PERF2BOLT: matched build-id and file name\n";
}
@ -411,7 +408,7 @@ std::error_code DataAggregator::writeAutoFDOData(StringRef OutputFilename) {
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
std::error_code EC;
raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::F_None);
raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
if (EC)
return EC;
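
The sys::fs open flags were renamed from F_* to OF_*; the raw_fd_ostream constructor is otherwise unchanged. A minimal sketch of the new flag spelling:

#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>

using namespace llvm;

// Open an output file with the renamed flag (formerly sys::fs::F_None).
static bool writeText(StringRef Path, StringRef Text) {
  std::error_code EC;
  raw_fd_ostream OS(Path, EC, sys::fs::OF_None);
  if (EC) {
    errs() << "unable to open " << Path << ": " << EC.message() << '\n';
    return false;
  }
  OS << Text;
  return true;
}
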
@ -765,17 +762,17 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From,
From -= Func.getAddress();
To -= Func.getAddress();
DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName()
<< " @ " << Twine::utohexstr(From) << " -> "
<< Func.getPrintName() << " @ " << Twine::utohexstr(To)
<< '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << Func.getPrintName()
<< " @ " << Twine::utohexstr(From) << " -> "
<< Func.getPrintName() << " @ " << Twine::utohexstr(To)
<< '\n');
if (BAT) {
From = BAT->translate(Func, From, /*IsBranchSrc=*/true);
To = BAT->translate(Func, To, /*IsBranchSrc=*/false);
DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: "
<< Func.getPrintName() << " @ " << Twine::utohexstr(From)
<< " -> " << Func.getPrintName() << " @ "
<< Twine::utohexstr(To) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: BAT translation on bumpBranchCount: "
<< Func.getPrintName() << " @ " << Twine::utohexstr(From)
<< " -> " << Func.getPrintName() << " @ "
<< Twine::utohexstr(To) << '\n');
}
AggrData->bumpBranchCount(From, To, Count, Mispreds);
@ -849,7 +846,7 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
auto *FromFunc = getBinaryFunctionContainingAddress(First.To);
auto *ToFunc = getBinaryFunctionContainingAddress(Second.From);
if (!FromFunc || !ToFunc) {
DEBUG(
LLVM_DEBUG(
dbgs() << "Out of range trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
@ -860,31 +857,32 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
}
if (FromFunc != ToFunc) {
NumInvalidTraces += Count;
DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress())
<< '\n');
LLVM_DEBUG(
dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n');
return false;
}
auto FTs = BAT ? BAT->getFallthroughsInTrace(*FromFunc, First.To, Second.From)
: getFallthroughsInTrace(*FromFunc, First, Second, Count);
if (!FTs) {
DEBUG(dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress())
<< '\n');
LLVM_DEBUG(
dbgs() << "Invalid trace starting in " << FromFunc->getPrintName()
<< " @ " << Twine::utohexstr(First.To - FromFunc->getAddress())
<< " and ending in " << ToFunc->getPrintName() << " @ "
<< ToFunc->getPrintName() << " @ "
<< Twine::utohexstr(Second.From - ToFunc->getAddress()) << '\n');
NumInvalidTraces += Count;
return false;
}
DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
<< FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To)
<< " to " << Twine::utohexstr(Second.From) << ".\n");
LLVM_DEBUG(dbgs() << "Processing " << FTs->size() << " fallthroughs for "
<< FromFunc->getPrintName() << ":"
<< Twine::utohexstr(First.To) << " to "
<< Twine::utohexstr(Second.From) << ".\n");
for (const auto &Pair : *FTs) {
doIntraBranch(*FromFunc, Pair.first + FromFunc->getAddress(),
Pair.second + FromFunc->getAddress(), Count, false);
@ -929,10 +927,11 @@ bool DataAggregator::recordTrace(
if (Instr && BC.MIB->isCall(*Instr)) {
FromBB = PrevBB;
} else {
DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR << '\n');
LLVM_DEBUG(dbgs() << "invalid incoming LBR (no call): " << FirstLBR
<< '\n');
}
} else {
DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n');
LLVM_DEBUG(dbgs() << "invalid incoming LBR: " << FirstLBR << '\n');
}
}
@ -952,9 +951,9 @@ bool DataAggregator::recordTrace(
// Check for bad LBRs.
if (!BB->getSuccessor(NextBB->getLabel())) {
DEBUG(dbgs() << "no fall-through for the trace:\n"
<< " " << FirstLBR << '\n'
<< " " << SecondLBR << '\n');
LLVM_DEBUG(dbgs() << "no fall-through for the trace:\n"
<< " " << FirstLBR << '\n'
<< " " << SecondLBR << '\n');
return false;
}
@ -1464,15 +1463,16 @@ std::error_code DataAggregator::parseBranchEvents() {
}
} else {
if (TraceBF && getBinaryFunctionContainingAddress(TraceTo)) {
DEBUG(dbgs() << "Invalid trace starting in "
<< TraceBF->getPrintName() << " @ "
<< Twine::utohexstr(TraceFrom - TraceBF->getAddress())
<< " and ending @ " << Twine::utohexstr(TraceTo)
<< '\n');
LLVM_DEBUG(dbgs()
<< "Invalid trace starting in "
<< TraceBF->getPrintName() << " @ "
<< Twine::utohexstr(TraceFrom - TraceBF->getAddress())
<< " and ending @ " << Twine::utohexstr(TraceTo)
<< '\n');
++NumInvalidTraces;
} else {
DEBUG(
dbgs() << "Out of range trace starting in "
LLVM_DEBUG(dbgs()
<< "Out of range trace starting in "
<< (TraceBF ? TraceBF->getPrintName() : "None") << " @ "
<< Twine::utohexstr(
TraceFrom - (TraceBF ? TraceBF->getAddress() : 0))
@ -1722,7 +1722,7 @@ void DataAggregator::processMemEvents() {
// Try to resolve symbol for PC
auto *Func = getBinaryFunctionContainingAddress(PC);
if (!Func) {
DEBUG(if (PC != 0) {
LLVM_DEBUG(if (PC != 0) {
dbgs() << "Skipped mem event: 0x" << Twine::utohexstr(PC) << " => 0x"
<< Twine::utohexstr(Addr) << "\n";
});
@ -1747,7 +1747,7 @@ void DataAggregator::processMemEvents() {
auto *MemData = &NamesToMemEvents[FuncName];
setMemData(*Func, MemData);
MemData->update(FuncLoc, AddrLoc);
DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n");
LLVM_DEBUG(dbgs() << "Mem event: " << FuncLoc << " = " << AddrLoc << "\n");
}
}
@ -2032,7 +2032,7 @@ std::error_code DataAggregator::parseMMapEvents() {
GlobalMMapInfo.insert(FileMMapInfo);
}
DEBUG(
LLVM_DEBUG(
dbgs() << "FileName -> mmap info:\n";
for (const auto &Pair : GlobalMMapInfo) {
dbgs() << " " << Pair.first << " : " << Pair.second.PID << " [0x"
@ -2140,7 +2140,7 @@ std::error_code DataAggregator::parseTaskEvents() {
outs() << "PERF2BOLT: input binary is associated with "
<< BinaryMMapInfo.size() << " PID(s)\n";
DEBUG(
LLVM_DEBUG(
for (auto &MMI : BinaryMMapInfo) {
outs() << " " << MMI.second.PID << (MMI.second.Forked ? " (forked)" : "")
<< ": (0x" << Twine::utohexstr(MMI.second.BaseAddress)
@ -2183,7 +2183,7 @@ DataAggregator::getFileNameForBuildID(StringRef FileBuildID) {
std::error_code
DataAggregator::writeAggregatedFile(StringRef OutputFilename) const {
std::error_code EC;
raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::F_None);
raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
if (EC)
return EC;


@ -427,8 +427,8 @@ void DataReader::readProfile(BinaryFunction &BF) {
if (!recordBranch(BF, BI.From.Offset, BI.To.Offset,
BI.Branches, BI.Mispreds)) {
DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> "
<< BI.To.Offset << '\n');
LLVM_DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> "
<< BI.To.Offset << '\n');
++MismatchedBranches;
}
}
@ -579,14 +579,11 @@ float DataReader::evaluateProfileData(BinaryFunction &BF,
continue;
}
DEBUG(dbgs()
<< "\tinvalid branch in " << BF << " : 0x"
<< Twine::utohexstr(BI.From.Offset) << " -> ";
if (BI.From.Name == BI.To.Name)
dbgs() << "0x" << Twine::utohexstr(BI.To.Offset) << '\n';
else
dbgs() << "<outbounds>\n";
);
LLVM_DEBUG(dbgs() << "\tinvalid branch in " << BF << " : 0x"
<< Twine::utohexstr(BI.From.Offset) << " -> ";
if (BI.From.Name == BI.To.Name) dbgs()
<< "0x" << Twine::utohexstr(BI.To.Offset) << '\n';
else dbgs() << "<outbounds>\n";);
}
const auto MatchRatio = (float) NumMatchedBranches / BranchData.Data.size();
@ -716,7 +713,7 @@ bool DataReader::recordBranch(BinaryFunction &BF,
auto *ToBB = BF.getBasicBlockContainingOffset(To);
if (!FromBB || !ToBB) {
DEBUG(dbgs() << "failed to get block for recorded branch\n");
LLVM_DEBUG(dbgs() << "failed to get block for recorded branch\n");
return false;
}
@ -757,8 +754,9 @@ bool DataReader::recordBranch(BinaryFunction &BF,
}
if (To <= LastInstrOffset) {
DEBUG(dbgs() << "branch recorded into the middle of the block" << " in "
<< BF << " : " << From << " -> " << To << '\n');
LLVM_DEBUG(dbgs() << "branch recorded into the middle of the block"
<< " in " << BF << " : " << From << " -> " << To
<< '\n');
return false;
}
}
@ -783,14 +781,14 @@ bool DataReader::recordBranch(BinaryFunction &BF,
// evaluate to true if the first instr is not a branch (call/jmp/ret/etc)
if (collectedInBoltedBinary()) {
if (FromBB->getInputOffset() != From) {
DEBUG(dbgs() << "offset " << From << " does not match a BB in " << BF
<< '\n');
LLVM_DEBUG(dbgs() << "offset " << From << " does not match a BB in "
<< BF << '\n');
return false;
}
FromInstruction = nullptr;
} else {
DEBUG(dbgs() << "no instruction for offset " << From << " in " << BF
<< '\n');
LLVM_DEBUG(dbgs() << "no instruction for offset " << From << " in " << BF
<< '\n');
return false;
}
}
@ -810,9 +808,9 @@ bool DataReader::recordBranch(BinaryFunction &BF,
if (collectedInBoltedBinary() && FromBB == ToBB)
return true;
DEBUG(dbgs() << "invalid branch in " << BF << '\n'
<< Twine::utohexstr(From) << " -> "
<< Twine::utohexstr(To) << '\n');
LLVM_DEBUG(dbgs() << "invalid branch in " << BF << '\n'
<< Twine::utohexstr(From) << " -> "
<< Twine::utohexstr(To) << '\n');
return false;
}

View File

@ -15,6 +15,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/LEB128.h"
#include <algorithm>
#include <cassert>
@ -39,30 +40,30 @@ namespace {
// Terminates the list by writing a pair of zeroes.
// Returns the number of written bytes.
uint64_t writeAddressRanges(
MCObjectWriter *Writer,
raw_svector_ostream &Stream,
const DebugAddressRangesVector &AddressRanges,
const bool WriteRelativeRanges = false) {
for (auto &Range : AddressRanges) {
Writer->writeLE64(Range.LowPC);
Writer->writeLE64(WriteRelativeRanges ? Range.HighPC - Range.LowPC
: Range.HighPC);
support::endian::write(Stream, Range.LowPC, support::little);
support::endian::write(
Stream, WriteRelativeRanges ? Range.HighPC - Range.LowPC : Range.HighPC,
support::little);
}
// Finish with 0 entries.
Writer->writeLE64(0);
Writer->writeLE64(0);
support::endian::write(Stream, 0ULL, support::little);
support::endian::write(Stream, 0ULL, support::little);
return AddressRanges.size() * 16 + 16;
}
} // namespace
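The hunk above replaces BOLT's use of MCObjectWriter with direct little-endian
writes to a raw_svector_ostream. A minimal sketch of the pattern, assuming
support::endian::write from llvm/Support/EndianStream.h (buffer name and
addresses are illustrative):

  SmallString<128> Buffer;
  raw_svector_ostream Stream(Buffer);
  // Each 64-bit write appends 8 little-endian bytes, so one address range
  // costs 16 bytes and the two-zero terminator another 16; hence the
  // "AddressRanges.size() * 16 + 16" return value above.
  support::endian::write(Stream, uint64_t(0x400000), support::little); // LowPC
  support::endian::write(Stream, uint64_t(0x400010), support::little); // HighPC
  support::endian::write(Stream, uint64_t(0), support::little);
  support::endian::write(Stream, uint64_t(0), support::little);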
DebugRangesSectionWriter::DebugRangesSectionWriter(BinaryContext *BC) {
RangesBuffer = llvm::make_unique<RangesBufferVector>();
RangesStream = llvm::make_unique<raw_svector_ostream>(*RangesBuffer);
Writer =
std::unique_ptr<MCObjectWriter>(BC->createObjectWriter(*RangesStream));
RangesBuffer = std::make_unique<RangesBufferVector>();
RangesStream = std::make_unique<raw_svector_ostream>(*RangesBuffer);
// Add an empty range as the first entry.
SectionOffset += writeAddressRanges(Writer.get(), DebugAddressRangesVector{});
SectionOffset +=
writeAddressRanges(*RangesStream.get(), DebugAddressRangesVector{});
}
uint64_t DebugRangesSectionWriter::addRanges(
@ -90,7 +91,7 @@ DebugRangesSectionWriter::addRanges(const DebugAddressRangesVector &Ranges) {
// unique and correct offsets in patches.
std::lock_guard<std::mutex> Lock(WriterMutex);
const auto EntryOffset = SectionOffset;
SectionOffset += writeAddressRanges(Writer.get(), Ranges);
SectionOffset += writeAddressRanges(*RangesStream.get(), Ranges);
return EntryOffset;
}
@ -101,8 +102,8 @@ void DebugARangesSectionWriter::addCURanges(uint64_t CUOffset,
CUAddressRanges.emplace(CUOffset, std::move(Ranges));
}
void
DebugARangesSectionWriter::writeARangesSection(MCObjectWriter *Writer) const {
void DebugARangesSectionWriter::writeARangesSection(
raw_svector_ostream &RangesStream) const {
// For reference on the format of the .debug_aranges section, see the DWARF4
// specification, section 6.1.4 Lookup by Address
// http://www.dwarfstd.org/doc/DWARF4.pdf
@ -116,58 +117,62 @@ DebugARangesSectionWriter::writeARangesSection(MCObjectWriter *Writer) const {
// + 2*sizeof(uint64_t) bytes for each of the ranges, plus an extra
// pair of uint64_t's for the terminating, zero-length range.
// Does not include size field itself.
uint64_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1);
uint32_t Size = 8 + 4 + 2*sizeof(uint64_t) * (AddressRanges.size() + 1);
// Header field #1: set size.
Writer->writeLE32(Size);
support::endian::write(RangesStream, Size, support::little);
// Header field #2: version number, 2 as per the specification.
Writer->writeLE16(2);
support::endian::write(RangesStream, static_cast<uint16_t>(2),
support::little);
// Header field #3: debug info offset of the correspondent compile unit.
Writer->writeLE32(Offset);
support::endian::write(RangesStream, static_cast<uint32_t>(Offset),
support::little);
// Header field #4: address size.
// 8 since we only write ELF64 binaries for now.
Writer->write8(8);
RangesStream << char(8);
// Header field #5: segment size of target architecture.
Writer->write8(0);
RangesStream << char(0);
// Padding before address table - 4 bytes in the 64-bit-pointer case.
Writer->writeLE32(0);
support::endian::write(RangesStream, static_cast<uint32_t>(0),
support::little);
writeAddressRanges(Writer, AddressRanges, true);
writeAddressRanges(RangesStream, AddressRanges, true);
}
}
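As a worked check of the Size computation above (the arithmetic is mine, not
part of the diff): the fixed header fields after the length field take
8 + 4 = 12 bytes, and the range table takes 2 * 8 bytes per range plus a
16-byte zero terminator. For three ranges, Size = 12 + 16 * (3 + 1) = 76,
excluding the 4-byte size field itself, as the code comment notes.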
DebugLocWriter::DebugLocWriter(BinaryContext *BC) {
LocBuffer = llvm::make_unique<LocBufferVector>();
LocStream = llvm::make_unique<raw_svector_ostream>(*LocBuffer);
Writer =
std::unique_ptr<MCObjectWriter>(BC->createObjectWriter(*LocStream));
LocBuffer = std::make_unique<LocBufferVector>();
LocStream = std::make_unique<raw_svector_ostream>(*LocBuffer);
}
// DWARF 4: 2.6.2
uint64_t DebugLocWriter::addList(const DWARFDebugLoc::LocationList &LocList) {
if (LocList.Entries.empty())
uint64_t
DebugLocWriter::addList(const DebugLocationsVector &LocList) {
if (LocList.empty())
return EmptyListTag;
// Since there is a separate DebugLocWriter for each thread,
// we don't need a lock to read the SectionOffset and update it.
const auto EntryOffset = SectionOffset;
for (const auto &Entry : LocList.Entries) {
Writer->writeLE64(Entry.Begin);
Writer->writeLE64(Entry.End);
Writer->writeLE16(Entry.Loc.size());
Writer->writeBytes(StringRef(
reinterpret_cast<const char *>(Entry.Loc.data()), Entry.Loc.size()));
SectionOffset += 2 * 8 + 2 + Entry.Loc.size();
for (const DebugLocationEntry &Entry : LocList) {
support::endian::write(*LocStream, static_cast<uint64_t>(Entry.LowPC),
support::little);
support::endian::write(*LocStream, static_cast<uint64_t>(Entry.HighPC),
support::little);
support::endian::write(*LocStream, static_cast<uint16_t>(Entry.Expr.size()),
support::little);
*LocStream << StringRef(reinterpret_cast<const char *>(Entry.Expr.data()),
Entry.Expr.size());
SectionOffset += 2 * 8 + 2 + Entry.Expr.size();
}
Writer->writeLE64(0);
Writer->writeLE64(0);
SectionOffset += 2 * 8;
LocStream->write_zeros(16);
SectionOffset += 16;
return EntryOffset;
}
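For illustration (values invented): a list with a single entry LowPC = 0x1000,
HighPC = 0x1010 and the one-byte expression DW_OP_reg0 (0x50) emits
8 + 8 + 2 + 1 = 19 bytes: two little-endian addresses, a 16-bit expression
length, and the raw expression bytes, followed by the 16 zero bytes that
terminate the list, matching the two SectionOffset updates above.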

View File

@ -55,6 +55,16 @@ static inline bool operator<(const DebugAddressRange &LHS,
/// DebugAddressRangesVector - represents a set of absolute address ranges.
using DebugAddressRangesVector = SmallVector<DebugAddressRange, 2>;
/// Address range with location used by .debug_loc section.
/// More compact than DWARFLocationEntry and uses absolute addresses.
struct DebugLocationEntry {
uint64_t LowPC;
uint64_t HighPC;
SmallVector<uint8_t, 4> Expr;
};
using DebugLocationsVector = SmallVector<DebugLocationEntry, 4>;
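// A minimal usage sketch (values invented for illustration):
//   DebugLocationEntry Entry{0x400000, 0x400010, {0x55}}; // DW_OP_reg5
//   DebugLocationsVector LocList;
//   LocList.push_back(Entry);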
/// References a row in a DWARFDebugLine::LineTable by the DWARF
/// Context index of the DWARF Compile Unit that owns the Line Table and the row
/// index. This is tied to our IR during disassembly so that we can later update
@ -122,8 +132,6 @@ private:
std::unique_ptr<raw_svector_ostream> RangesStream;
std::unique_ptr<MCObjectWriter> Writer;
std::mutex WriterMutex;
/// Current offset in the section (updated as new entries are written).
@ -141,7 +149,7 @@ public:
void addCURanges(uint64_t CUOffset, DebugAddressRangesVector &&Ranges);
/// Writes .debug_aranges with the added ranges to the MCObjectWriter.
void writeARangesSection(MCObjectWriter *Writer) const;
void writeARangesSection(raw_svector_ostream &RangesStream) const;
/// Resets the writer to a clear state.
void reset() {
@ -172,7 +180,7 @@ class DebugLocWriter {
public:
DebugLocWriter(BinaryContext *BC);
uint64_t addList(const DWARFDebugLoc::LocationList &LocList);
uint64_t addList(const DebugLocationsVector &LocList);
std::unique_ptr<LocBufferVector> finalize() {
return std::move(LocBuffer);
@ -191,8 +199,6 @@ private:
std::unique_ptr<raw_svector_ostream> LocStream;
std::unique_ptr<MCObjectWriter> Writer;
/// Current offset in the section (updated as new entries are written).
/// Starts with 0 here since this only writes part of a full location lists
/// section. In the final section, the first 16 bytes are reserved for an

View File

@ -181,7 +181,7 @@ DynoStats getDynoStats(const BinaryFunction &BF) {
const auto *LastInstr = BB->getLastNonPseudoInstr();
if (BC.MIB->getJumpTable(*LastInstr)) {
Stats[DynoStats::JUMP_TABLE_BRANCHES] += BBExecutionCount;
DEBUG(
LLVM_DEBUG(
static uint64_t MostFrequentJT;
if (BBExecutionCount > MostFrequentJT) {
MostFrequentJT = BBExecutionCount;

View File

@ -115,7 +115,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
StringRef(reinterpret_cast<const char *>(LSDASectionData.data()),
LSDASectionData.size()),
BC.DwCtx->getDWARFObj().isLittleEndian(), 8);
uint32_t Offset = getLSDAAddress() - LSDASectionAddress;
uint64_t Offset = getLSDAAddress() - LSDASectionAddress;
assert(Data.isValidOffset(Offset) && "wrong LSDA address");
uint8_t LPStartEncoding = Data.getU8(&Offset);
@ -145,10 +145,10 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
}
// Table to store list of indices in type table. Entries are uleb128 values.
const uint32_t TypeIndexTableStart = Offset + TTypeEnd;
const uint64_t TypeIndexTableStart = Offset + TTypeEnd;
// Offset past the last decoded index.
uint32_t MaxTypeIndexTableOffset = 0;
uint64_t MaxTypeIndexTableOffset = 0;
// Max positive index used in type table.
unsigned MaxTypeIndex = 0;
@ -204,7 +204,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
if (Label != Labels.end()) {
LPSymbol = Label->second;
} else {
LPSymbol = BC.Ctx->createTempSymbol("LP", true);
LPSymbol = BC.Ctx->createNamedTempSymbol("LP");
Labels[LandingPad] = LPSymbol;
}
}
@ -231,7 +231,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
if (ActionEntry != 0) {
auto printType = [&](int Index, raw_ostream &OS) {
assert(Index > 0 && "only positive indices are valid");
uint32_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
uint64_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
const uint64_t TTEntryAddress = TTEntry + LSDASectionAddress;
uint64_t TypeAddress =
*Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress);
@ -255,7 +255,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
};
if (opts::PrintExceptions)
outs() << " actions: ";
uint32_t ActionPtr = ActionTableStart + ActionEntry - 1;
uint64_t ActionPtr = ActionTableStart + ActionEntry - 1;
int64_t ActionType;
int64_t ActionNext;
auto Sep = "";
@ -284,7 +284,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
// of indices with base 1.
// E.g. -1 means offset 0, -2 is offset 1, etc. The indices are
// encoded using uleb128 thus we cannot directly dereference them.
uint32_t TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1;
uint64_t TypeIndexTablePtr = TypeIndexTableStart - ActionType - 1;
while (auto Index = Data.getULEB128(&TypeIndexTablePtr)) {
MaxTypeIndex = std::max(MaxTypeIndex, static_cast<unsigned>(Index));
if (opts::PrintExceptions) {
@ -319,7 +319,7 @@ void BinaryFunction::parseLSDA(ArrayRef<uint8_t> LSDASectionData,
MaxTypeIndex * TTypeEncodingSize -
ActionTableStart);
for (unsigned Index = 1; Index <= MaxTypeIndex; ++Index) {
uint32_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
uint64_t TTEntry = TypeTableStart - Index * TTypeEncodingSize;
const auto TTEntryAddress = TTEntry + LSDASectionAddress;
uint64_t TypeAddress =
*Data.getEncodedPointer(&TTEntry, TTypeEncoding, TTEntryAddress);
@ -413,7 +413,7 @@ void BinaryFunction::updateEHRanges() {
MCInst EHLabel;
{
std::unique_lock<std::shared_timed_mutex> Lock(BC.CtxMutex);
EHSymbol = BC.Ctx->createTempSymbol("EH", true);
EHSymbol = BC.Ctx->createNamedTempSymbol("EH");
BC.MIB->createEHLabel(EHLabel, EHSymbol, BC.Ctx.get());
}
@ -579,12 +579,12 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
break;
case DW_CFA_def_cfa:
Function.addCFIInstruction(
Offset, MCCFIInstruction::createDefCfa(nullptr, Instr.Ops[0],
Instr.Ops[1]));
Offset, MCCFIInstruction::cfiDefCfa(nullptr, Instr.Ops[0],
Instr.Ops[1]));
break;
case DW_CFA_def_cfa_sf:
Function.addCFIInstruction(
Offset, MCCFIInstruction::createDefCfa(
Offset, MCCFIInstruction::cfiDefCfa(
nullptr, Instr.Ops[0],
DataAlignment * int64_t(Instr.Ops[1])));
break;
@ -596,11 +596,11 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
case DW_CFA_def_cfa_offset:
Function.addCFIInstruction(
Offset,
MCCFIInstruction::createDefCfaOffset(nullptr, Instr.Ops[0]));
MCCFIInstruction::cfiDefCfaOffset(nullptr, Instr.Ops[0]));
break;
case DW_CFA_def_cfa_offset_sf:
Function.addCFIInstruction(
Offset, MCCFIInstruction::createDefCfaOffset(
Offset, MCCFIInstruction::cfiDefCfaOffset(
nullptr, DataAlignment * int64_t(Instr.Ops[0])));
break;
case DW_CFA_GNU_args_size:
@ -615,37 +615,21 @@ bool CFIReaderWriter::fillCFIInfoFor(BinaryFunction &Function) const {
errs() << "BOLT-WARNING: DWARF val_offset() unimplemented\n";
}
return false;
case DW_CFA_expression:
case DW_CFA_def_cfa_expression:
case DW_CFA_val_expression: {
MCDwarfExprBuilder Builder;
for (auto &ExprOp : *Instr.Expression) {
const DWARFExpression::Operation::Description &Desc =
ExprOp.getDescription();
if (Desc.Op[0] == DWARFExpression::Operation::SizeNA) {
Builder.appendOperation(ExprOp.getCode());
} else if (Desc.Op[1] == DWARFExpression::Operation::SizeNA) {
Builder.appendOperation(ExprOp.getCode(),
ExprOp.getRawOperand(0));
} else {
Builder.appendOperation(ExprOp.getCode(), ExprOp.getRawOperand(0),
ExprOp.getRawOperand(1));
}
}
if (Opcode == DW_CFA_expression) {
Function.addCFIInstruction(
Offset, MCCFIInstruction::createExpression(
nullptr, Instr.Ops[0], Builder.take()));
} else if (Opcode == DW_CFA_def_cfa_expression) {
Function.addCFIInstruction(Offset,
MCCFIInstruction::createDefCfaExpression(
nullptr, Builder.take()));
} else {
assert(Opcode == DW_CFA_val_expression && "Unexpected opcode");
Function.addCFIInstruction(
Offset, MCCFIInstruction::createValExpression(
nullptr, Instr.Ops[0], Builder.take()));
case DW_CFA_val_expression:
case DW_CFA_expression: {
StringRef ExprBytes = Instr.Expression->getData();
std::string Str;
raw_string_ostream OS(Str);
// Manually encode this instruction using CFI escape
OS << Opcode;
if (Opcode != DW_CFA_def_cfa_expression) {
encodeULEB128(Instr.Ops[0], OS);
}
encodeULEB128(ExprBytes.size(), OS);
OS << ExprBytes;
Function.addCFIInstruction(
Offset, MCCFIInstruction::createEscape(nullptr, OS.str()));
break;
}
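// Illustrative byte layout for the escape above (example values, not from
// this commit): DW_CFA_expression (opcode 0x10) for register 6 with the
// one-byte expression DW_OP_reg6 (0x56) serializes as
//   0x10 0x06 0x01 0x56
// i.e. opcode, ULEB128 register, ULEB128 expression length, raw bytes.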
case DW_CFA_MIPS_advance_loc8:
@ -697,48 +681,54 @@ std::vector<char> CFIReaderWriter::generateEHFrameHeader(
std::sort(FailedAddresses.begin(), FailedAddresses.end());
// Initialize PCToFDE using NewEHFrame.
NewEHFrame.for_each_FDE([&](const dwarf::FDE *FDE) {
for (dwarf::FrameEntry &Entry : NewEHFrame.entries()) {
const dwarf::FDE *FDE = dyn_cast<dwarf::FDE>(&Entry);
if (FDE == nullptr)
continue;
const auto FuncAddress = FDE->getInitialLocation();
const auto FDEAddress = NewEHFrame.getEHFrameAddress() + FDE->getOffset();
// Ignore unused FDEs.
if (FuncAddress == 0)
return;
continue;
// Add the address to the map unless we failed to write it.
if (!std::binary_search(FailedAddresses.begin(), FailedAddresses.end(),
FuncAddress)) {
DEBUG(dbgs() << "BOLT-DEBUG: FDE for function at 0x"
<< Twine::utohexstr(FuncAddress) << " is at 0x"
<< Twine::utohexstr(FDEAddress) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: FDE for function at 0x"
<< Twine::utohexstr(FuncAddress) << " is at 0x"
<< Twine::utohexstr(FDEAddress) << '\n');
PCToFDE[FuncAddress] = FDEAddress;
}
});
};
DEBUG(dbgs() << "BOLT-DEBUG: new .eh_frame contains "
<< std::distance(NewEHFrame.entries().begin(),
NewEHFrame.entries().end())
<< " entries\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: new .eh_frame contains "
<< std::distance(NewEHFrame.entries().begin(),
NewEHFrame.entries().end())
<< " entries\n");
// Add entries from the original .eh_frame corresponding to the functions
// that we did not update.
OldEHFrame.for_each_FDE([&](const dwarf::FDE *FDE) {
for (const dwarf::FrameEntry &Entry : OldEHFrame) {
const dwarf::FDE *FDE = dyn_cast<dwarf::FDE>(&Entry);
if (FDE == nullptr)
continue;
const auto FuncAddress = FDE->getInitialLocation();
const auto FDEAddress = OldEHFrame.getEHFrameAddress() + FDE->getOffset();
// Add the address if we failed to write it.
if (PCToFDE.count(FuncAddress) == 0) {
DEBUG(dbgs() << "BOLT-DEBUG: old FDE for function at 0x"
<< Twine::utohexstr(FuncAddress) << " is at 0x"
<< Twine::utohexstr(FDEAddress) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: old FDE for function at 0x"
<< Twine::utohexstr(FuncAddress) << " is at 0x"
<< Twine::utohexstr(FDEAddress) << '\n');
PCToFDE[FuncAddress] = FDEAddress;
}
});
};
DEBUG(dbgs() << "BOLT-DEBUG: old .eh_frame contains "
<< std::distance(OldEHFrame.entries().begin(),
OldEHFrame.entries().end())
<< " entries\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: old .eh_frame contains "
<< std::distance(OldEHFrame.entries().begin(),
OldEHFrame.entries().end())
<< " entries\n");
// Generate a new .eh_frame_hdr based on the new map.
@ -778,5 +768,143 @@ std::vector<char> CFIReaderWriter::generateEHFrameHeader(
return EHFrameHeader;
}
Error EHFrameParser::parseCIE(uint64_t StartOffset) {
uint8_t Version = Data.getU8(&Offset);
const char *Augmentation = Data.getCStr(&Offset);
StringRef AugmentationString(Augmentation ? Augmentation : "");
uint8_t AddressSize =
Version < 4 ? Data.getAddressSize() : Data.getU8(&Offset);
Data.setAddressSize(AddressSize);
// Skip segment descriptor size
if (Version >= 4)
Offset += 1;
// Skip code alignment factor
Data.getULEB128(&Offset);
// Skip data alignment
Data.getSLEB128(&Offset);
// Skip return address register
if (Version == 1) {
Offset += 1;
} else {
Data.getULEB128(&Offset);
}
uint32_t FDEPointerEncoding = DW_EH_PE_absptr;
uint32_t LSDAPointerEncoding = DW_EH_PE_omit;
// Walk the augmentation string to get all the augmentation data.
for (unsigned i = 0, e = AugmentationString.size(); i != e; ++i) {
switch (AugmentationString[i]) {
default:
return createStringError(
errc::invalid_argument,
"unknown augmentation character in entry at 0x%" PRIx64, StartOffset);
case 'L':
LSDAPointerEncoding = Data.getU8(&Offset);
break;
case 'P': {
uint32_t PersonalityEncoding = Data.getU8(&Offset);
Optional<uint64_t> Personality =
Data.getEncodedPointer(&Offset, PersonalityEncoding,
EHFrameAddress ? EHFrameAddress + Offset : 0);
// Patch personality address
if (Personality)
PatcherCallback(*Personality, Offset, PersonalityEncoding);
break;
}
case 'R':
FDEPointerEncoding = Data.getU8(&Offset);
break;
case 'z':
if (i)
return createStringError(
errc::invalid_argument,
"'z' must be the first character at 0x%" PRIx64, StartOffset);
// Skip augmentation length
Data.getULEB128(&Offset);
break;
case 'S':
case 'B':
break;
}
}
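// Example (standard .eh_frame practice, not specific to this commit): the
// common augmentation string "zR" yields an augmentation-length field ('z')
// followed by an FDE pointer-encoding byte ('R'); "zPLR" additionally
// carries a personality routine pointer ('P') and an LSDA encoding ('L').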
Entries.emplace_back(std::make_unique<CIEInfo>(
FDEPointerEncoding, LSDAPointerEncoding, AugmentationString));
CIEs[StartOffset] = &*Entries.back();
return Error::success();
}
Error EHFrameParser::parseFDE(uint64_t CIEPointer,
uint64_t StartStructureOffset) {
Optional<uint64_t> LSDAAddress;
CIEInfo *Cie = CIEs[StartStructureOffset - CIEPointer];
// The address size is encoded in the CIE we reference.
if (!Cie)
return createStringError(errc::invalid_argument,
"parsing FDE data at 0x%" PRIx64
" failed due to missing CIE",
StartStructureOffset);
// Patch initial location
if (auto Val = Data.getEncodedPointer(&Offset, Cie->FDEPtrEncoding,
EHFrameAddress + Offset)) {
PatcherCallback(*Val, Offset, Cie->FDEPtrEncoding);
}
// Skip address range
Data.getEncodedPointer(&Offset, Cie->FDEPtrEncoding, 0);
// Process augmentation data for this FDE.
StringRef AugmentationString = Cie->AugmentationString;
if (!AugmentationString.empty() && Cie->LSDAPtrEncoding != DW_EH_PE_omit) {
// Skip augmentation length
Data.getULEB128(&Offset);
LSDAAddress =
Data.getEncodedPointer(&Offset, Cie->LSDAPtrEncoding,
EHFrameAddress ? Offset + EHFrameAddress : 0);
// Patch LSDA address
PatcherCallback(*LSDAAddress, Offset, Cie->LSDAPtrEncoding);
}
return Error::success();
}
Error EHFrameParser::parse() {
while (Data.isValidOffset(Offset)) {
const uint64_t StartOffset = Offset;
uint64_t Length;
DwarfFormat Format;
std::tie(Length, Format) = Data.getInitialLength(&Offset);
// If the Length is 0, then this CIE is a terminator
if (Length == 0)
break;
const uint64_t StartStructureOffset = Offset;
const uint64_t EndStructureOffset = Offset + Length;
Error Err = Error::success();
const uint64_t Id = Data.getRelocatedValue(4, &Offset,
/*SectionIndex=*/nullptr, &Err);
if (Err)
return Err;
if (!Id) {
if (Error Err = parseCIE(StartOffset))
return Err;
} else {
if (Error Err = parseFDE(Id, StartStructureOffset))
return Err;
}
Offset = EndStructureOffset;
}
return Error::success();
}
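// Entry layout being walked (illustrative summary): each record starts with
// an initial-length field, followed by a 4-byte id. A zero id marks a CIE;
// a non-zero id is the backward offset from the FDE's id field to its
// parent CIE, which is how parseFDE() locates the CIEInfo above.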
Error EHFrameParser::parse(DWARFDataExtractor Data, uint64_t EHFrameAddress,
PatcherCallbackTy PatcherCallback) {
EHFrameParser Parser(Data, EHFrameAddress, PatcherCallback);
return Parser.parse();
}
} // namespace bolt
} // namespace llvm

View File

@ -14,8 +14,11 @@
#include "BinaryContext.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Error.h"
#include <map>
namespace llvm {
@ -62,6 +65,51 @@ private:
FDEsMap FDEs;
};
/// Parse an existing .eh_frame and invoke the callback for each
/// address that needs to be fixed if we want to preserve the original
/// .eh_frame while changing code location.
/// This code is based on DWARFDebugFrame::parse(), but trimmed down to
/// parse only the structures that have address references.
class EHFrameParser {
public:
using PatcherCallbackTy = std::function<void(uint64_t, uint64_t, uint64_t)>;
/// Call PatcherCallback for every encountered external reference in frame
/// data. The expected signature is:
///
/// void PatcherCallback(uint64_t Value, uint64_t Offset, uint64_t Type);
///
/// where Value is the value of the reference, Offset is the offset into the
/// frame data at which the reference occurred, and Type is the DWARF
/// encoding type of the reference.
static Error parse(DWARFDataExtractor Data, uint64_t EHFrameAddress,
PatcherCallbackTy PatcherCallback);
private:
EHFrameParser(DWARFDataExtractor D, uint64_t E, PatcherCallbackTy P)
: Data(D), EHFrameAddress(E), PatcherCallback(P), Offset(0) {}
struct CIEInfo {
uint64_t FDEPtrEncoding;
uint64_t LSDAPtrEncoding;
StringRef AugmentationString;
CIEInfo(uint64_t F, uint64_t L, StringRef A)
: FDEPtrEncoding(F), LSDAPtrEncoding(L), AugmentationString(A) {}
};
Error parseCIE(uint64_t StartOffset);
Error parseFDE(uint64_t CIEPointer, uint64_t StartStructureOffset);
Error parse();
DWARFDataExtractor Data;
uint64_t EHFrameAddress;
PatcherCallbackTy PatcherCallback;
uint64_t Offset;
DenseMap<uint64_t, CIEInfo *> CIEs;
std::vector<std::unique_ptr<CIEInfo>> Entries;
};
} // namespace bolt
} // namespace llvm
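A minimal calling sketch for the parser above; the frame contents, base
address, and error handling are assumptions for illustration, not part of
this commit:

  DWARFDataExtractor Data(EHFrameContents, /*IsLittleEndian=*/true,
                          /*AddressSize=*/8);
  if (Error E = EHFrameParser::parse(
          Data, EHFrameAddress,
          [](uint64_t Value, uint64_t Offset, uint64_t Type) {
            // Record or rewrite the reference; Type is a DW_EH_PE_* encoding.
            dbgs() << "ref 0x" << Twine::utohexstr(Value) << " at offset 0x"
                   << Twine::utohexstr(Offset) << '\n';
          }))
    errs() << toString(std::move(E)) << '\n';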

View File

@ -41,6 +41,15 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
return DataCopy;
}
if (!IsCode &&
(SectionName == ".strtab" ||
SectionName == ".symtab" ||
SectionName == "" ||
SectionName.startswith(".rela."))) {
return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID,
SectionName, IsReadOnly);
}
uint8_t *Ret;
if (IsCode) {
Ret = SectionMemoryManager::allocateCodeSection(Size, Alignment,
@ -72,16 +81,17 @@ uint8_t *ExecutableFileMemoryManager::allocateSection(intptr_t Size,
assert(Section.isAllocatable() &&
"verify that allocatable is marked as allocatable");
DEBUG(dbgs() << "BOLT: allocating "
<< (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data"))
<< " section : " << SectionName
<< " with size " << Size << ", alignment " << Alignment
<< " at 0x" << Ret << ", ID = " << SectionID << "\n");
LLVM_DEBUG(
dbgs() << "BOLT: allocating "
<< (IsCode ? "code" : (IsReadOnly ? "read-only data" : "data"))
<< " section : " << SectionName << " with size " << Size
<< ", alignment " << Alignment << " at 0x" << Ret
<< ", ID = " << SectionID << "\n");
return Ret;
}
bool ExecutableFileMemoryManager::finalizeMemory(std::string *ErrMsg) {
DEBUG(dbgs() << "BOLT: finalizeMemory()\n");
LLVM_DEBUG(dbgs() << "BOLT: finalizeMemory()\n");
++ObjectsLoaded;
return SectionMemoryManager::finalizeMemory(ErrMsg);
}

View File

@ -44,7 +44,7 @@ public:
uint32_t ObjectsLoaded{0};
ExecutableFileMemoryManager(BinaryContext &BC, bool AllowStubs)
: BC(BC), AllowStubs(AllowStubs) {}
: BC(BC), AllowStubs(AllowStubs) {}
~ExecutableFileMemoryManager();
@ -62,6 +62,17 @@ public:
/*IsCode=*/false, IsReadOnly);
}
// Ignore TLS sections by treating them as a regular data section
TLSSection allocateTLSSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID,
StringRef SectionName) override {
TLSSection Res;
Res.Offset = 0;
Res.InitializationImage = allocateDataSection(
Size, Alignment, SectionID, SectionName, /*IsReadOnly=*/false);
return Res;
}
bool allowStubAllocation() const override { return AllowStubs; }
bool finalizeMemory(std::string *ErrMsg = nullptr) override;

View File

@ -50,8 +50,8 @@ void Heatmap::registerAddressRange(uint64_t StartAddress, uint64_t EndAddress,
if (StartAddress > EndAddress ||
EndAddress - StartAddress > 64 * 1024) {
DEBUG(dbgs() << "invalid range : 0x" << Twine::utohexstr(StartAddress)
<< " -> 0x" << Twine::utohexstr(EndAddress) << '\n');
LLVM_DEBUG(dbgs() << "invalid range : 0x" << Twine::utohexstr(StartAddress)
<< " -> 0x" << Twine::utohexstr(EndAddress) << '\n');
++NumSkippedRanges;
return;
}
@ -64,7 +64,7 @@ void Heatmap::registerAddressRange(uint64_t StartAddress, uint64_t EndAddress,
void Heatmap::print(StringRef FileName) const {
std::error_code EC;
raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::F_None);
raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
if (EC) {
errs() << "error opening output file: " << EC.message() << '\n';
exit(1);
@ -241,7 +241,7 @@ void Heatmap::print(raw_ostream &OS) const {
void Heatmap::printCDF(StringRef FileName) const {
std::error_code EC;
raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::F_None);
raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
if (EC) {
errs() << "error opening output file: " << EC.message() << '\n';
exit(1);

View File

@ -100,11 +100,11 @@ void JumpTable::updateOriginal() {
const auto RelType =
Type == JTT_NORMAL ? ELF::R_X86_64_64 : ELF::R_X86_64_PC32;
const uint64_t RelAddend = (Type == JTT_NORMAL ? 0 : Offset - BaseOffset);
DEBUG(dbgs() << "BOLT-DEBUG: adding relocation to section "
<< getSectionName() << " at offset 0x"
<< Twine::utohexstr(Offset) << " for symbol "
<< Entry->getName() << " with addend "
<< Twine::utohexstr(RelAddend) << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: adding relocation to section "
<< getSectionName() << " at offset 0x"
<< Twine::utohexstr(Offset) << " for symbol "
<< Entry->getName() << " with addend "
<< Twine::utohexstr(RelAddend) << '\n');
getOutputSection().addRelocation(Offset, Entry, RelType, RelAddend);
Offset += EntrySize;
}

View File

@ -54,10 +54,14 @@ bool MCPlusBuilder::equals(const MCOperand &A, const MCOperand &B,
if (!B.isImm())
return false;
return A.getImm() == B.getImm();
} else if (A.isFPImm()) {
if (!B.isFPImm())
} else if (A.isSFPImm()) {
if (!B.isSFPImm())
return false;
return A.getFPImm() == B.getFPImm();
return A.getSFPImm() == B.getSFPImm();
} else if (A.isDFPImm()) {
if (!B.isDFPImm())
return false;
return A.getDFPImm() == B.getDFPImm();
} else if (A.isExpr()) {
if (!B.isExpr())
return false;
@ -232,12 +236,16 @@ bool MCPlusBuilder::removeAnnotation(MCInst &Inst, unsigned Index) {
return false;
}
void MCPlusBuilder::stripAnnotations(MCInst &Inst) {
void MCPlusBuilder::stripAnnotations(MCInst &Inst, bool KeepTC) {
auto *AnnotationInst = getAnnotationInst(Inst);
if (!AnnotationInst)
return;
// Preserve TailCall annotation.
auto IsTCOrErr = tryGetAnnotationAs<bool>(Inst, "TC");
Inst.erase(std::prev(Inst.end()));
if (KeepTC && IsTCOrErr)
addAnnotation(Inst, "TC", *IsTCOrErr);
}
void
@ -383,8 +391,8 @@ MCPlusBuilder::getAliases(MCPhysReg Reg,
SuperReg.emplace_back(I);
}
std::queue<MCPhysReg> Worklist;
// Propagate alias info upwards
for (MCPhysReg I = 0, E = RegInfo->getNumRegs(); I != E; ++I) {
// Propagate alias info upwards. Skip reg 0 (mapped to NoRegister)
for (MCPhysReg I = 1, E = RegInfo->getNumRegs(); I < E; ++I) {
Worklist.push(I);
}
while (!Worklist.empty()) {
@ -398,7 +406,7 @@ MCPlusBuilder::getAliases(MCPhysReg Reg,
}
}
// Propagate parent reg downwards
for (MCPhysReg I = 0, E = RegInfo->getNumRegs(); I != E; ++I) {
for (MCPhysReg I = 1, E = RegInfo->getNumRegs(); I < E; ++I) {
Worklist.push(I);
}
while (!Worklist.empty()) {
@ -410,7 +418,7 @@ MCPlusBuilder::getAliases(MCPhysReg Reg,
}
}
DEBUG({
LLVM_DEBUG({
dbgs() << "Dumping reg alias table:\n";
for (MCPhysReg I = 0, E = RegInfo->getNumRegs(); I != E; ++I) {
dbgs() << "Reg " << I << ": ";
@ -442,7 +450,7 @@ MCPlusBuilder::getRegSize(MCPhysReg Reg) const {
for (auto I = RegInfo->regclass_begin(), E = RegInfo->regclass_end(); I != E;
++I) {
for (MCPhysReg Reg : *I) {
SizeMap[Reg] = I->getSize();
SizeMap[Reg] = I->getSizeInBits() / 8;
}
}

View File

@ -33,7 +33,6 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/StringPool.h"
#include <cassert>
#include <cstdint>
#include <map>
@ -154,14 +153,6 @@ protected:
const MCInstrInfo *Info;
const MCRegisterInfo *RegInfo;
/// Hash a PooledStringPtr. It's ok to use the address since all these
/// strings are interned.
struct HashPooledStringPtr {
size_t operator()(const PooledStringPtr &Str) const {
return reinterpret_cast<size_t>(Str.begin());
}
};
/// Map annotation name into an annotation index.
StringMap<uint64_t> AnnotationNameIndexMap;
@ -569,6 +560,10 @@ public:
return false;
}
/// If non-zero, this is used to fill the executable space with instructions
/// that will trap. Defaults to 0.
virtual unsigned getTrapFillValue() const { return 0; }
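// For example (an assumed override, not part of this change): an x86-64
// implementation could return 0xCC so that padding decodes as INT3 and
// traps on execution.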
/// Interface and basic functionality of a MCInstMatcher. The idea is to make
/// it easy to match one or more MCInsts against a tree-like pattern and
/// extract the fragment operands. Example:
@ -983,7 +978,7 @@ public:
/// ConstantData array starting from \p offset and assuming little-endianness.
/// Return true on success. The given instruction is modified in place.
virtual bool replaceMemOperandWithImm(MCInst &Inst, StringRef ConstantData,
uint32_t Offset) const {
uint64_t Offset) const {
llvm_unreachable("not implemented");
return false;
}
@ -1068,7 +1063,7 @@ public:
/// Return MCSymbol extracted from a target expression
virtual const MCSymbol *getTargetSymbol(const MCExpr *Expr) const {
return &Expr->getSymbol();
return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
}
/// Return MCSymbol/offset extracted from a target expression
@ -1218,7 +1213,7 @@ public:
}
/// Replace instruction opcode to make it a regular call instead of a tail call.
virtual bool convertTailCallToCall(MCInst &Inst) const {
virtual bool convertTailCallToCall(MCInst &Inst) {
llvm_unreachable("not implemented");
return false;
}
@ -1227,13 +1222,13 @@ public:
/// a destination from a memory location pointed by \p TargetLocation symbol.
virtual bool convertCallToIndirectCall(MCInst &Inst,
const MCSymbol *TargetLocation,
MCContext *Ctx) const {
MCContext *Ctx) {
llvm_unreachable("not implemented");
return false;
}
/// Morph an indirect call into a load where \p Reg holds the call target.
virtual void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) const {
virtual void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) {
llvm_unreachable("not implemented");
}
@ -1369,7 +1364,7 @@ public:
}
virtual bool createIndirectCall(MCInst &Inst, const MCSymbol *TargetLocation,
MCContext *Ctx, bool IsTailCall) const {
MCContext *Ctx, bool IsTailCall) {
llvm_unreachable("not implemented");
return false;
}
@ -1510,10 +1505,17 @@ public:
return false;
}
/// Return the condition code used in a conditional jump instruction.
/// Returns an invalid code if the instruction is not a conditional jump.
virtual unsigned getCondCode(const MCInst &Inst) const {
llvm_unreachable("not implemented");
return false;
}
/// Return canonical branch opcode for a reversible branch opcode. For every
/// opposite branch opcode pair Op <-> OpR this function returns one of the
/// opcodes that is considered canonical.
virtual unsigned getCanonicalBranchOpcode(unsigned BranchOpcode) const {
virtual unsigned getCanonicalBranchCondCode(unsigned CC) const {
llvm_unreachable("not implemented");
return false;
}
@ -1554,7 +1556,7 @@ public:
const auto Index =
AnnotationNameIndexMap.size() + MCPlus::MCAnnotation::kGeneric;
AnnotationNameIndexMap.insert(std::make_pair(Name, Index));
AnnotationNames.push_back(Name);
AnnotationNames.emplace_back(std::string(Name));
return Index;
}
@ -1718,12 +1720,12 @@ public:
}
/// Remove meta-data, but don't destroy it.
void stripAnnotations(MCInst &Inst);
void stripAnnotations(MCInst &Inst, bool KeepTC = false);
virtual std::vector<MCInst>
createInstrumentedIndirectCall(const MCInst &CallInst, bool TailCall,
MCSymbol *HandlerFuncAddr, int CallSiteID,
MCContext *Ctx) const {
MCContext *Ctx) {
llvm_unreachable("not implemented");
return std::vector<MCInst>();
}

View File

@ -25,6 +25,8 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/ToolOutputFile.h"
@ -32,7 +34,8 @@ namespace opts {
using namespace llvm;
extern cl::opt<unsigned> AlignText;
extern cl::opt<bool> CheckOverlappingElements;
//FIXME! Upstream change
//extern cl::opt<bool> CheckOverlappingElements;
extern cl::opt<bool> ForcePatch;
extern cl::opt<bool> Instrument;
extern cl::opt<bool> InstrumentCalls;
@ -61,9 +64,7 @@ MachORewriteInstance::MachORewriteInstance(object::MachOObjectFile *InputFile,
: InputFile(InputFile), ToolPath(ToolPath),
BC(BinaryContext::createBinaryContext(
InputFile, /* IsPIC */ true,
DWARFContext::create(*InputFile, nullptr,
DWARFContext::defaultErrorHandler, "",
false))) {}
DWARFContext::create(*InputFile))) {}
Error MachORewriteInstance::setProfile(StringRef Filename) {
if (!sys::fs::exists(Filename))
@ -76,7 +77,7 @@ Error MachORewriteInstance::setProfile(StringRef Filename) {
" and " + Filename, inconvertibleErrorCode());
}
ProfileReader = llvm::make_unique<DataReader>(Filename);
ProfileReader = std::make_unique<DataReader>(Filename);
return Error::success();
}
@ -103,15 +104,16 @@ void MachORewriteInstance::processProfileData() {
void MachORewriteInstance::readSpecialSections() {
for (const auto &Section : InputFile->sections()) {
StringRef SectionName;
check_error(Section.getName(SectionName), "cannot get section name");
Expected<StringRef> SectionName = Section.getName();
check_error(SectionName.takeError(), "cannot get section name");
// Only register sections with names.
if (!SectionName.empty()) {
if (!SectionName->empty()) {
BC->registerSection(Section);
DEBUG(dbgs() << "BOLT-DEBUG: registering section " << SectionName
<< " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x"
<< Twine::utohexstr(Section.getAddress() + Section.getSize())
<< "\n");
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: registering section " << *SectionName
<< " @ 0x" << Twine::utohexstr(Section.getAddress()) << ":0x"
<< Twine::utohexstr(Section.getAddress() + Section.getSize())
<< "\n");
}
}
@ -199,10 +201,10 @@ void MachORewriteInstance::discoverFileObjects() {
return;
std::stable_sort(FunctionSymbols.begin(), FunctionSymbols.end(),
[](const SymbolRef &LHS, const SymbolRef &RHS) {
return LHS.getValue() < RHS.getValue();
return cantFail(LHS.getValue()) < cantFail(RHS.getValue());
});
for (size_t Index = 0; Index < FunctionSymbols.size(); ++Index) {
const uint64_t Address = FunctionSymbols[Index].getValue();
const uint64_t Address = cantFail(FunctionSymbols[Index].getValue());
auto Section = BC->getSectionForAddress(Address);
// TODO: It happens for some symbols (e.g. __mh_execute_header).
// Add proper logic to handle them correctly.
@ -216,7 +218,7 @@ void MachORewriteInstance::discoverFileObjects() {
cantFail(FunctionSymbols[Index].getName(), "cannot get symbol name")
.str();
// Uniquify names of local symbols.
if (!(FunctionSymbols[Index].getFlags() & SymbolRef::SF_Global))
if (!(cantFail(FunctionSymbols[Index].getFlags()) & SymbolRef::SF_Global))
SymbolName = NR.uniquify(SymbolName);
section_iterator S = cantFail(FunctionSymbols[Index].getSection());
@ -225,11 +227,11 @@ void MachORewriteInstance::discoverFileObjects() {
size_t NFIndex = Index + 1;
// Skip aliases.
while (NFIndex < FunctionSymbols.size() &&
FunctionSymbols[NFIndex].getValue() == Address)
cantFail(FunctionSymbols[NFIndex].getValue()) == Address)
++NFIndex;
if (NFIndex < FunctionSymbols.size() &&
S == cantFail(FunctionSymbols[NFIndex].getSection()))
EndAddress = FunctionSymbols[NFIndex].getValue();
EndAddress = cantFail(FunctionSymbols[NFIndex].getValue());
const uint64_t SymbolSize = EndAddress - Address;
const auto It = BC->getBinaryFunctions().find(Address);
@ -318,21 +320,21 @@ void MachORewriteInstance::postProcessFunctions() {
void MachORewriteInstance::runOptimizationPasses() {
BinaryFunctionPassManager Manager(*BC);
if (opts::Instrument) {
Manager.registerPass(llvm::make_unique<PatchEntries>());
Manager.registerPass(llvm::make_unique<Instrumentation>(opts::NeverPrint));
Manager.registerPass(std::make_unique<PatchEntries>());
Manager.registerPass(std::make_unique<Instrumentation>(opts::NeverPrint));
}
Manager.registerPass(
llvm::make_unique<ReorderBasicBlocks>(opts::PrintReordered));
std::make_unique<ReorderBasicBlocks>(opts::PrintReordered));
Manager.registerPass(
llvm::make_unique<FixupBranches>(opts::PrintAfterBranchFixup));
std::make_unique<FixupBranches>(opts::PrintAfterBranchFixup));
// This pass should always run last.
Manager.registerPass(
llvm::make_unique<FinalizeFunctions>(opts::PrintFinalized));
std::make_unique<FinalizeFunctions>(opts::PrintFinalized));
Manager.runPasses();
}
void MachORewriteInstance::mapInstrumentationSection(orc::VModuleKey Key, StringRef SectionName) {
void MachORewriteInstance::mapInstrumentationSection(StringRef SectionName) {
if (!opts::Instrument)
return;
ErrorOr<BinarySection &> Section = BC->getUniqueSectionByName(SectionName);
@ -342,10 +344,11 @@ void MachORewriteInstance::mapInstrumentationSection(orc::VModuleKey Key, String
}
if (!Section->hasValidSectionID())
return;
OLT->mapSectionAddress(Key, Section->getSectionID(), Section->getAddress());
RTDyld->reassignSectionAddress(Section->getSectionID(),
Section->getAddress());
}
void MachORewriteInstance::mapCodeSections(orc::VModuleKey Key) {
void MachORewriteInstance::mapCodeSections() {
for (BinaryFunction *Function : BC->getAllBinaryFunctions()) {
if (!Function->isEmitted())
continue;
@ -359,11 +362,11 @@ void MachORewriteInstance::mapCodeSections(orc::VModuleKey Key) {
FuncSection.getError());
FuncSection->setOutputAddress(Function->getOutputAddress());
DEBUG(dbgs() << "BOLT: mapping 0x"
LLVM_DEBUG(dbgs() << "BOLT: mapping 0x"
<< Twine::utohexstr(FuncSection->getAllocAddress()) << " to 0x"
<< Twine::utohexstr(Function->getOutputAddress()) << '\n');
OLT->mapSectionAddress(Key, FuncSection->getSectionID(),
Function->getOutputAddress());
RTDyld->reassignSectionAddress(FuncSection->getSectionID(),
Function->getOutputAddress());
Function->setImageAddress(FuncSection->getAllocAddress());
Function->setImageSize(FuncSection->getOutputSize());
}
@ -384,7 +387,7 @@ void MachORewriteInstance::mapCodeSections(orc::VModuleKey Key) {
assert(FuncSection && "cannot find section for function");
Addr = llvm::alignTo(Addr, 4);
FuncSection->setOutputAddress(Addr);
OLT->mapSectionAddress(Key, FuncSection->getSectionID(), Addr);
RTDyld->reassignSectionAddress(FuncSection->getSectionID(), Addr);
Function->setFileOffset(Addr - BOLT->getAddress() +
BOLT->getInputFileOffset());
Function->setImageAddress(FuncSection->getAllocAddress());
@ -395,27 +398,56 @@ void MachORewriteInstance::mapCodeSections(orc::VModuleKey Key) {
}
}
namespace {
class BOLTSymbolResolver : public LegacyJITSymbolResolver {
BinaryContext &BC;
public:
BOLTSymbolResolver(BinaryContext &BC) : BC(BC) {}
JITSymbol findSymbolInLogicalDylib(const std::string &Name) override {
return JITSymbol(nullptr);
}
JITSymbol findSymbol(const std::string &Name) override {
LLVM_DEBUG(dbgs() << "BOLT: looking for " << Name << "\n");
if (auto *I = BC.getBinaryDataByName(Name)) {
const uint64_t Address = I->isMoved() && !I->isJumpTable()
? I->getOutputAddress()
: I->getAddress();
LLVM_DEBUG(dbgs() << "Resolved to address 0x" << Twine::utohexstr(Address)
<< "\n");
return JITSymbol(Address, JITSymbolFlags());
}
LLVM_DEBUG(dbgs() << "Resolved to address 0x0\n");
return JITSymbol(nullptr);
}
};
} // end anonymous namespace
void MachORewriteInstance::emitAndLink() {
std::error_code EC;
std::unique_ptr<::llvm::ToolOutputFile> TempOut =
llvm::make_unique<::llvm::ToolOutputFile>(
opts::OutputFilename + ".bolt.o", EC, sys::fs::F_None);
std::make_unique<::llvm::ToolOutputFile>(
opts::OutputFilename + ".bolt.o", EC, sys::fs::OF_None);
check_error(EC, "cannot create output object file");
if (opts::KeepTmp)
TempOut->keep();
std::unique_ptr<buffer_ostream> BOS =
make_unique<buffer_ostream>(TempOut->os());
std::make_unique<buffer_ostream>(TempOut->os());
raw_pwrite_stream *OS = BOS.get();
MCCodeEmitter *MCE =
BC->TheTarget->createMCCodeEmitter(*BC->MII, *BC->MRI, *BC->Ctx);
MCAsmBackend *MAB =
BC->TheTarget->createMCAsmBackend(*BC->STI, *BC->MRI, MCTargetOptions());
std::unique_ptr<MCObjectWriter> OW = MAB->createObjectWriter(*OS);
std::unique_ptr<MCStreamer> Streamer(BC->TheTarget->createMCObjectStreamer(
*BC->TheTriple, *BC->Ctx, std::unique_ptr<MCAsmBackend>(MAB), *OS,
std::unique_ptr<MCCodeEmitter>(MCE), *BC->STI,
*BC->TheTriple, *BC->Ctx, std::unique_ptr<MCAsmBackend>(MAB),
std::move(OW), std::unique_ptr<MCCodeEmitter>(MCE), *BC->STI,
/* RelaxAll */ false,
/* IncrementalLinkerCompatible */ false,
/* DWARFMustBeAtTheEnd */ false));
@ -429,68 +461,42 @@ void MachORewriteInstance::emitAndLink() {
"error creating in-memory object");
assert(Obj && "createObjectFile cannot return nullptr");
auto Resolver = orc::createLegacyLookupResolver(
[&](const std::string &Name) -> JITSymbol {
llvm::errs() << "looking for " << Name << "\n";
DEBUG(dbgs() << "BOLT: looking for " << Name << "\n");
if (auto *I = BC->getBinaryDataByName(Name)) {
const uint64_t Address = I->isMoved() && !I->isJumpTable()
? I->getOutputAddress()
: I->getAddress();
DEBUG(dbgs() << "Resolved to address 0x" << Twine::utohexstr(Address)
<< "\n");
return JITSymbol(Address, JITSymbolFlags());
}
DEBUG(dbgs() << "Resolved to address 0x0\n");
return JITSymbol(nullptr);
},
[](Error Err) { cantFail(std::move(Err), "lookup failed"); });
Resolver->setAllowsZeroSymbols(true);
auto Resolver = BOLTSymbolResolver(*BC);
MCAsmLayout FinalLayout(
static_cast<MCObjectStreamer *>(Streamer.get())->getAssembler());
SSP.reset(new decltype(SSP)::element_type());
ES.reset(new decltype(ES)::element_type(*SSP));
BC->EFMM.reset(new ExecutableFileMemoryManager(*BC, /*AllowStubs*/ false));
const orc::VModuleKey K = ES->allocateVModule();
OLT.reset(new decltype(OLT)::element_type(
*ES,
[this, &Resolver](orc::VModuleKey Key) {
orc::RTDyldObjectLinkingLayer::Resources R;
R.MemMgr = BC->EFMM;
R.Resolver = Resolver;
return R;
},
[&](orc::VModuleKey Key, const object::ObjectFile &Obj,
const RuntimeDyld::LoadedObjectInfo &) {
if (Key == K) {
mapCodeSections(Key);
mapInstrumentationSection(Key, "__counters");
mapInstrumentationSection(Key, "__tables");
} else {
RTDyld.reset(new decltype(RTDyld)::element_type(*BC->EFMM, Resolver));
RTDyld->setProcessAllSections(true);
RTDyld->loadObject(*Obj);
if (RTDyld->hasError()) {
outs() << "BOLT-ERROR: RTDyld failed.\n";
exit(1);
}
// Assign addresses to all sections. If key corresponds to the object
// created by ourselves, call our regular mapping function. If we are
// loading additional objects as part of runtime libraries for
// instrumentation, treat them as extra sections.
mapCodeSections();
mapInstrumentationSection("__counters");
mapInstrumentationSection("__tables");
// TODO: Refactor addRuntimeLibSections to work properly on Mach-O
// and use it here.
mapInstrumentationSection(Key, "I__setup");
mapInstrumentationSection(Key, "I__fini");
mapInstrumentationSection(Key, "I__data");
mapInstrumentationSection(Key, "I__text");
mapInstrumentationSection(Key, "I__cstring");
mapInstrumentationSection(Key, "I__literal16");
}
},
[&](orc::VModuleKey Key) {
}));
//FIXME! Put this in RtLibrary->link
// mapInstrumentationSection("I__setup");
// mapInstrumentationSection("I__fini");
// mapInstrumentationSection("I__data");
// mapInstrumentationSection("I__text");
// mapInstrumentationSection("I__cstring");
// mapInstrumentationSection("I__literal16");
OLT->setProcessAllSections(true);
cantFail(OLT->addObject(K, std::move(ObjectMemBuffer)));
cantFail(OLT->emitAndFinalize(K));
if (auto *RtLibrary = BC->getRuntimeLibrary()) {
RtLibrary->link(*BC, ToolPath, *ES, *OLT);
}
// if (auto *RtLibrary = BC->getRuntimeLibrary()) {
// RtLibrary->link(*BC, ToolPath, *ES, *OLT);
// }
}
void MachORewriteInstance::writeInstrumentationSection(StringRef SectionName,
@ -514,9 +520,8 @@ void MachORewriteInstance::writeInstrumentationSection(StringRef SectionName,
void MachORewriteInstance::rewriteFile() {
std::error_code EC;
Out = llvm::make_unique<ToolOutputFile>(
opts::OutputFilename, EC, sys::fs::F_None,
sys::fs::all_read | sys::fs::all_write | sys::fs::all_exe);
Out = std::make_unique<ToolOutputFile>(opts::OutputFilename, EC,
sys::fs::OF_None);
check_error(EC, "cannot create output executable file");
raw_fd_ostream &OS = Out->os();
OS << InputFile->getData();
@ -552,10 +557,14 @@ void MachORewriteInstance::rewriteFile() {
writeInstrumentationSection("I__literal16", OS);
Out->keep();
EC = sys::fs::setPermissions(opts::OutputFilename,
sys::fs::perms::all_all);
check_error(EC, "cannot set permissions of output file");
}
void MachORewriteInstance::adjustCommandLineOptions() {
opts::CheckOverlappingElements = false;
//FIXME! Upstream change
// opts::CheckOverlappingElements = false;
if (!opts::AlignText.getNumOccurrences())
opts::AlignText = BC->PageAlign;
if (opts::Instrument.getNumOccurrences())

View File

@ -43,9 +43,7 @@ class MachORewriteInstance {
NameResolver NR;
std::unique_ptr<orc::SymbolStringPool> SSP;
std::unique_ptr<orc::ExecutionSession> ES;
std::unique_ptr<orc::RTDyldObjectLinkingLayer> OLT;
std::unique_ptr<RuntimeDyld> RTDyld;
std::unique_ptr<ToolOutputFile> Out;
@ -56,8 +54,8 @@ class MachORewriteInstance {
static StringRef getOrgSecPrefix() { return ".bolt.org"; }
void mapInstrumentationSection(orc::VModuleKey Key, StringRef SectionName);
void mapCodeSections(orc::VModuleKey Key);
void mapInstrumentationSection(StringRef SectionName);
void mapCodeSections();
void adjustCommandLineOptions();
void readSpecialSections();

View File

@ -22,7 +22,7 @@ extern cl::OptionCategory BoltCategory;
cl::opt<unsigned>
ThreadCount("thread-count",
cl::desc("number of threads"),
cl::init(hardware_concurrency()),
cl::init(hardware_concurrency().compute_thread_count()),
cl::cat(BoltCategory));
cl::opt<bool>
@ -101,7 +101,8 @@ ThreadPool &getThreadPool() {
if (ThreadPoolPtr.get())
return *ThreadPoolPtr;
ThreadPoolPtr = std::make_unique<ThreadPool>(opts::ThreadCount);
ThreadPoolPtr = std::make_unique<ThreadPool>(
llvm::hardware_concurrency(opts::ThreadCount));
return *ThreadPoolPtr;
}
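A minimal sketch of the updated threading API used above (LLVM's
ThreadPool.h and Threading.h; the lambda body is a placeholder):

  // hardware_concurrency(N) returns a ThreadPoolStrategy requesting N
  // threads, or the detected hardware parallelism when N is 0.
  ThreadPool Pool(llvm::hardware_concurrency(opts::ThreadCount));
  Pool.async([] { /* per-function work */ });
  Pool.wait();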
@ -115,7 +116,7 @@ void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
Timer T(LogName, LogName);
DEBUG(T.startTimer());
LLVM_DEBUG(T.startTimer());
for (auto It = BlockBegin; It != BlockEnd; ++It) {
auto &BF = It->second;
@ -124,7 +125,7 @@ void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFunction(BF);
}
DEBUG(T.stopTimer());
LLVM_DEBUG(T.stopTimer());
};
if (opts::NoThreads || ForceSequential) {
@ -170,7 +171,7 @@ void runOnEachFunctionWithUniqueAllocId(
std::map<uint64_t, BinaryFunction>::iterator BlockEnd,
MCPlusBuilder::AllocatorIdTy AllocId) {
Timer T(LogName, LogName);
DEBUG(T.startTimer());
LLVM_DEBUG(T.startTimer());
std::shared_lock<std::shared_timed_mutex> Lock(MainLock);
for (auto It = BlockBegin; It != BlockEnd; ++It) {
auto &BF = It->second;
@ -179,7 +180,7 @@ void runOnEachFunctionWithUniqueAllocId(
WorkFunction(BF, AllocId);
}
DEBUG(T.stopTimer());
LLVM_DEBUG(T.stopTimer());
};
if (opts::NoThreads || ForceSequential) {

View File

@ -19,6 +19,7 @@
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "MCPlusBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ThreadPool.h"
using namespace llvm;

View File

@ -153,7 +153,7 @@ void AlignerPass::alignBlocks(BinaryFunction &Function,
BB->setAlignmentMaxBytes(BytesToUse);
// Update stats.
DEBUG(
LLVM_DEBUG(
std::unique_lock<std::shared_timed_mutex> Lock(AlignHistogramMtx);
AlignHistogram[BytesToUse]++;
AlignedBlocksCount += BB->getKnownExecutionCount();
@ -184,7 +184,7 @@ void AlignerPass::runOnFunctions(BinaryContext &BC) {
BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
ParallelUtilities::PredicateTy(nullptr), "AlignerPass");
DEBUG(
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n";
for (unsigned I = 1; I < AlignHistogram.size(); ++I) {
dbgs() << " " << I << " : " << AlignHistogram[I] << '\n';

View File

@ -73,7 +73,7 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
continue;
}
DEBUG({
LLVM_DEBUG({
dbgs() << "At \"" << BF.getPrintName() << "\", combining: \n";
Inst.dump();
Prev->dump();
@ -85,7 +85,7 @@ void AllocCombinerPass::combineAdjustments(BinaryContext &BC,
BC.MIB->addToImm(Inst, Adjustment, BC.Ctx.get());
DEBUG({
LLVM_DEBUG({
dbgs() << "After adjustment:\n";
Inst.dump();
});

View File

@ -12,8 +12,9 @@
#include "BinaryFunctionCallGraph.h"
#include "BinaryFunction.h"
#include "BinaryContext.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include <stack>
#define DEBUG_TYPE "callgraph"
@ -140,8 +141,8 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
if (auto *DstFunc =
DestSymbol ? BC.getFunctionForSymbol(DestSymbol) : nullptr) {
if (DstFunc == Function) {
DEBUG(dbgs() << "BOLT-INFO: recursive call detected in "
<< *DstFunc << "\n");
LLVM_DEBUG(dbgs() << "BOLT-INFO: recursive call detected in "
<< *DstFunc << "\n");
++RecursiveCallsites;
if (IgnoreRecursiveCalls)
return false;
@ -155,7 +156,7 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
if (!IsValidCount)
++NoProfileCallsites;
Cg.incArcWeight(SrcId, DstId, AdjCount, Offset);
DEBUG(
LLVM_DEBUG(
if (opts::Verbosity > 1) {
dbgs() << "BOLT-DEBUG: buildCallGraph: call " << *Function
<< " -> " << *DstFunc << " @ " << Offset << "\n";
@ -194,8 +195,9 @@ BinaryFunctionCallGraph buildCallGraph(BinaryContext &BC,
// fall back to the CFG walker which attempts to handle missing data.
if (!Function->hasValidProfile() && CgFromPerfData &&
!Function->getAllCallSites().empty()) {
DEBUG(dbgs() << "BOLT-DEBUG: buildCallGraph: Falling back to perf data"
<< " for " << *Function << "\n");
LLVM_DEBUG(
dbgs() << "BOLT-DEBUG: buildCallGraph: Falling back to perf data"
<< " for " << *Function << "\n");
++NumFallbacks;
const auto Size = functionSize(Function);
for (const auto &CSI : Function->getAllCallSites()) {

View File

@ -14,7 +14,7 @@
#include "ParallelUtilities.h"
#include "Passes/ReorderAlgorithm.h"
#include "Passes/ReorderFunctions.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <numeric>
#include <vector>
@ -277,7 +277,7 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction& Function) {
unsigned Count;
uint64_t Bytes;
Function.markUnreachableBlocks();
DEBUG({
LLVM_DEBUG({
for (auto *BB : Function.layout()) {
if (!BB->isValid()) {
dbgs() << "BOLT-INFO: UCE found unreachable block " << BB->getName()
@ -403,10 +403,10 @@ void ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
Algo.reset(new ReverseReorderAlgorithm());
} else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
// Work on optimal solution if problem is small enough
DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
Algo.reset(new TSPReorderAlgorithm());
} else {
DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
std::unique_ptr<ClusterAlgorithm> CAlgo;
if (MinBranchClusters)
@ -557,6 +557,13 @@ void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
}
}
}
for (BinaryFunction *BF : BC.getInjectedBinaryFunctions()) {
for (BinaryBasicBlock &BB : *BF) {
for (MCInst &Instruction : BB) {
BC.MIB->stripAnnotations(Instruction);
}
}
}
// Release all memory taken by annotations
BC.MIB->freeAnnotations();
@ -601,8 +608,8 @@ uint64_t fixDoubleJumps(BinaryContext &BC,
MCInst *UncondBranch = nullptr;
auto Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
if(!Res) {
DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
Pred->dump());
LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
Pred->dump());
return false;
}
Pred->replaceSuccessor(&BB, Succ);
@ -638,10 +645,10 @@ uint64_t fixDoubleJumps(BinaryContext &BC,
}
++NumDoubleJumps;
DEBUG(dbgs() << "Removed double jump in " << Function << " from "
<< Pred->getName() << " -> " << BB.getName() << " to "
<< Pred->getName() << " -> " << SuccSym->getName()
<< (!Succ ? " (tail)\n" : "\n"));
LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
<< Pred->getName() << " -> " << BB.getName() << " to "
<< Pred->getName() << " -> " << SuccSym->getName()
<< (!Succ ? " (tail)\n" : "\n"));
return true;
};
@ -781,8 +788,8 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
// analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
if (!Result) {
DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
PredBB->dump());
LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
PredBB->dump());
continue;
}
@ -902,12 +909,12 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryContext &BC,
assert(BF.validateCFG());
}
DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
<< " conditional tail calls from a total of "
<< NumLocalCTCCandidates << " candidates in function " << BF
<< ". CTCs execution count for this function is "
<< LocalCTCExecCount << " and CTC taken count is "
<< LocalCTCTakenCount << "\n";);
LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
<< " conditional tail calls from a total of "
<< NumLocalCTCCandidates << " candidates in function " << BF
<< ". CTCs execution count for this function is "
<< LocalCTCExecCount << " and CTC taken count is "
<< LocalCTCTakenCount << "\n";);
NumTailCallsPatched += NumLocalCTCs;
NumCandidateTailCalls += NumLocalCTCCandidates;
@ -1263,7 +1270,7 @@ void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
if (WorstBiasFunc && opts::Verbosity >= 1) {
outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
<< "\n";
DEBUG(WorstBiasFunc->dump());
LLVM_DEBUG(WorstBiasFunc->dump());
}
}

View File

@ -1,142 +0,0 @@
#include "Passes/BranchPredictionInfo.h"
#include "BinaryBasicBlock.h"
namespace llvm {
namespace bolt {
void BranchPredictionInfo::findLoopEdgesInfo(const BinaryLoopInfo &LoopsInfo) {
// Traverse discovered loops
std::stack<BinaryLoop *> Loops;
for (BinaryLoop *BL : LoopsInfo)
Loops.push(BL);
while (!Loops.empty()) {
BinaryLoop *Loop = Loops.top();
Loops.pop();
BinaryBasicBlock *LoopHeader = Loop->getHeader();
LoopHeaders.insert(LoopHeader);
// Add nested loops to the stack.
for (BinaryLoop::iterator I = Loop->begin(), E = Loop->end(); I != E; ++I) {
Loops.push(*I);
}
SmallVector<BinaryBasicBlock *, 1> Latches;
Loop->getLoopLatches(Latches);
// Find back edges.
for (BinaryBasicBlock *Latch : Latches) {
for (BinaryBasicBlock *Succ : Latch->successors()) {
if (Succ == LoopHeader) {
Edge CFGEdge = std::make_pair(Latch->getLabel(), Succ->getLabel());
BackEdges.insert(CFGEdge);
}
}
}
// Find exit edges.
SmallVector<BinaryLoop::Edge, 1> AuxExitEdges;
Loop->getExitEdges(AuxExitEdges);
for (BinaryLoop::Edge &Exit : AuxExitEdges) {
ExitEdges.insert(Exit);
}
}
}
void BranchPredictionInfo::findBasicBlockInfo(const BinaryFunction &Function,
BinaryContext &BC) {
for (auto &BB : Function) {
for (auto &Inst : BB) {
if (BC.MIB->isCall(Inst))
CallSet.insert(&BB);
else if (BC.MIB->isStore(Inst))
StoreSet.insert(&BB);
}
}
}
bool BranchPredictionInfo::isBackEdge(const Edge &CFGEdge) const {
return BackEdges.count(CFGEdge);
}
bool BranchPredictionInfo::isBackEdge(const BinaryBasicBlock *SrcBB,
const BinaryBasicBlock *DstBB) const {
const Edge CFGEdge = std::make_pair(SrcBB->getLabel(), DstBB->getLabel());
return isBackEdge(CFGEdge);
}
bool BranchPredictionInfo::isExitEdge(const BinaryLoop::Edge &CFGEdge) const {
return ExitEdges.count(CFGEdge);
}
bool BranchPredictionInfo::isExitEdge(const BinaryBasicBlock *SrcBB,
const BinaryBasicBlock *DstBB) const {
const BinaryLoop::Edge CFGEdge = std::make_pair(SrcBB, DstBB);
return isExitEdge(CFGEdge);
}
bool BranchPredictionInfo::isLoopHeader(const BinaryBasicBlock *BB) const {
return LoopHeaders.count(BB);
}
bool BranchPredictionInfo::hasCallInst(const BinaryBasicBlock *BB) const {
return CallSet.count(BB);
}
bool BranchPredictionInfo::hasStoreInst(const BinaryBasicBlock *BB) const {
return StoreSet.count(BB);
}
bool BranchPredictionInfo::callToExit(BinaryBasicBlock *BB,
BinaryContext &BC) const {
auto &currBB = *BB;
for (auto &Inst : currBB) {
if (BC.MIB->isCall(Inst)) {
if (const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst)) {
StringRef CalleeName = CalleeSymbol->getName();
if (CalleeName == "__cxa_throw@PLT" ||
CalleeName == "_Unwind_Resume@PLT" ||
CalleeName == "__cxa_rethrow@PLT" || CalleeName == "exit@PLT" ||
CalleeName == "abort@PLT")
return true;
}
}
}
return false;
}
unsigned BranchPredictionInfo::countBackEdges(BinaryBasicBlock *BB) const {
unsigned CountEdges = 0;
for (BinaryBasicBlock *SuccBB : BB->successors()) {
const Edge CFGEdge = std::make_pair(BB->getLabel(), SuccBB->getLabel());
if (BackEdges.count(CFGEdge))
++CountEdges;
}
return CountEdges;
}
unsigned BranchPredictionInfo::countExitEdges(BinaryBasicBlock *BB) const {
unsigned CountEdges = 0;
for (BinaryBasicBlock *SuccBB : BB->successors()) {
const BinaryLoop::Edge CFGEdge = std::make_pair(BB, SuccBB);
if (ExitEdges.count(CFGEdge))
++CountEdges;
}
return CountEdges;
}
void BranchPredictionInfo::clear() {
LoopHeaders.clear();
BackEdges.clear();
ExitEdges.clear();
CallSet.clear();
StoreSet.clear();
}
} // namespace bolt
} // namespace llvm

View File

@ -1,110 +0,0 @@
//===------ Passes/BranchPredictionInfo.h ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is an auxiliary class to the feature miner, static branch probability
// and frequency passes. This class is responsible for finding loop info (loop
// back edges, loop exit edges and loop headers) of a function. It also finds
// basic block info (whether a block contains store and call instructions) and
// whether a basic block contains a call that exits the program.
//
//===----------------------------------------------------------------------===//
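// A minimal usage sketch (an illustration, not code from this commit; it
// assumes `Function` is a populated BinaryFunction and `BC` its
// BinaryContext; see FeatureMiner.cpp for the real call sites):
//
//   BranchPredictionInfo BPI;
//   if (!Function.isLoopFree())
//     BPI.findLoopEdgesInfo(Function.getLoopInfo());
//   BPI.findBasicBlockInfo(Function, BC);
//   unsigned NumHeaders = 0;
//   for (BinaryBasicBlock &BB : Function)
//     if (BPI.isLoopHeader(&BB))
//       ++NumHeaders; // count loop headers, for illustration
//   BPI.clear();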
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_BRANCHPREDICTIONINFO_H_
#define LLVM_TOOLS_LLVM_BOLT_PASSES_BRANCHPREDICTIONINFO_H_
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryLoop.h"
#include "llvm/MC/MCSymbol.h"
namespace llvm {
namespace bolt {
class BranchPredictionInfo {
public:
/// An edge indicates that control flow may go from a basic block (source)
/// to another one (destination); this pair of basic blocks is used
/// to index maps and retrieve the content of sets.
typedef std::pair<const MCSymbol *, const MCSymbol *> Edge;
private:
/// Holds the loop headers of a given function.
DenseSet<const BinaryBasicBlock *> LoopHeaders;
/// Holds the loop backedges of a given function.
DenseSet<Edge> BackEdges;
/// Holds the loop exit edges of a given function.
DenseSet<BinaryLoop::Edge> ExitEdges;
/// Holds the basic blocks of a given function
/// that contain at least one call instruction.
DenseSet<const BinaryBasicBlock *> CallSet;
/// Holds the basic blocks of a given function
/// that contain at least one store instruction.
DenseSet<const BinaryBasicBlock *> StoreSet;
unsigned NumLoads;
unsigned NumStores;
public:
unsigned getNumLoads() { return NumLoads; }
unsigned getNumStores() { return NumStores; }
/// findLoopEdgesInfo - Finds all loop back edges, loop exit edges
/// and loop headers within the function.
void findLoopEdgesInfo(const BinaryLoopInfo &LoopsInfo);
/// findBasicBlockInfo - Finds all call and store instructions within
/// the basic blocks of a given function.
void findBasicBlockInfo(const BinaryFunction &Function, BinaryContext &BC);
/// isBackEdge - Checks if the edge is a loop back edge.
bool isBackEdge(const Edge &CFGEdge) const;
/// isBackEdge - Checks if the edge is a loop back edge.
bool isBackEdge(const BinaryBasicBlock *SrcBB,
const BinaryBasicBlock *DstBB) const;
/// isExitEdge - Checks if the edge is a loop exit edge.
bool isExitEdge(const BinaryLoop::Edge &CFGEdge) const;
/// isExitEdge - Checks if the edge is a loop exit edge.
bool isExitEdge(const BinaryBasicBlock *SrcBB,
const BinaryBasicBlock *DstBB) const;
/// isLoopHeader - Checks if the basic block is a loop header.
bool isLoopHeader(const BinaryBasicBlock *BB) const;
/// hasCallInst - Checks if the basic block has a call instruction.
bool hasCallInst(const BinaryBasicBlock *BB) const;
/// hasStoreInst - Checks if the basic block has a store instruction.
bool hasStoreInst(const BinaryBasicBlock *BB) const;
/// callToExit - Checks if a basic block invokes an exit function.
bool callToExit(BinaryBasicBlock *BB, BinaryContext &BC) const;
/// countBackEdges - Computes the number of BB's successor edges that are
/// loop back edges.
unsigned countBackEdges(BinaryBasicBlock *BB) const;
/// countExitEdges - Computes the number of BB's successor edges that are
/// loop exit edges.
unsigned countExitEdges(BinaryBasicBlock *BB) const;
/// clear - Clears all content from the data structures used.
void clear();
};
} // namespace bolt
} // namespace llvm
#endif /* LLVM_TOOLS_LLVM_BOLT_PASSES_BRANCHPREDICTIONINFO_H_ */

View File

@ -43,4 +43,4 @@ add_llvm_library(LLVMBOLTPasses
intrinsics_gen
)
include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src )
include_directories( ${BOLT_SOURCE_DIR}/src )

View File

@ -1,712 +0,0 @@
//===--- CachePlusReorderAlgorithm.cpp - Order basic blocks ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "CacheMetrics.h"
#include "ReorderAlgorithm.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"
using namespace llvm;
using namespace bolt;
using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;
cl::opt<unsigned>
ClusterSplitThreshold("cluster-split-threshold",
cl::desc("The maximum size of a cluster to apply splitting"),
cl::init(128),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
}
namespace llvm {
namespace bolt {
namespace {
// A cluster (ordered sequence) of basic blocks
class Cluster {
public:
Cluster(BinaryBasicBlock *BB, uint64_t ExecutionCount_, uint64_t Size_)
: Id(BB->getLayoutIndex()),
IsEntry(BB->getLayoutIndex() == 0),
ExecutionCount(ExecutionCount_),
Size(Size_),
Score(0) {
Blocks.push_back(BB);
}
size_t id() const {
return Id;
}
uint64_t size() const {
return Size;
}
double density() const {
return static_cast<double>(ExecutionCount) / Size;
}
bool isCold() const {
return ExecutionCount == 0;
}
uint64_t executionCount() const {
return ExecutionCount;
}
bool isEntryPoint() const {
return IsEntry;
}
double score() const {
return Score;
}
const std::vector<BinaryBasicBlock *> &blocks() const {
return Blocks;
}
/// Update the list of basic blocks and aggregated cluster data
void merge(const Cluster *Other,
const std::vector<BinaryBasicBlock *> &MergedBlocks,
double MergedScore) {
Blocks = MergedBlocks;
IsEntry |= Other->IsEntry;
ExecutionCount += Other->ExecutionCount;
Size += Other->Size;
Score = MergedScore;
}
void clear() {
Blocks.clear();
}
private:
std::vector<BinaryBasicBlock *> Blocks;
size_t Id;
bool IsEntry;
uint64_t ExecutionCount;
uint64_t Size;
double Score;
};
using ClusterIter = std::vector<BinaryBasicBlock *>::const_iterator;
// A wrapper around three clusters of basic blocks; it is used to avoid extra
// instantiation of the vectors.
class MergedCluster {
public:
MergedCluster(ClusterIter Begin1,
ClusterIter End1,
ClusterIter Begin2,
ClusterIter End2,
ClusterIter Begin3,
ClusterIter End3)
: Begin1(Begin1),
End1(End1),
Begin2(Begin2),
End2(End2),
Begin3(Begin3),
End3(End3) {}
template<typename F>
void forEach(const F &Func) const {
for (auto It = Begin1; It != End1; It++)
Func(*It);
for (auto It = Begin2; It != End2; It++)
Func(*It);
for (auto It = Begin3; It != End3; It++)
Func(*It);
}
std::vector<BinaryBasicBlock *> getBlocks() const {
std::vector<BinaryBasicBlock *> Result;
Result.reserve(std::distance(Begin1, End1) +
std::distance(Begin2, End2) +
std::distance(Begin3, End3));
Result.insert(Result.end(), Begin1, End1);
Result.insert(Result.end(), Begin2, End2);
Result.insert(Result.end(), Begin3, End3);
return Result;
}
const BinaryBasicBlock *getFirstBlock() const {
return *Begin1;
}
private:
ClusterIter Begin1;
ClusterIter End1;
ClusterIter Begin2;
ClusterIter End2;
ClusterIter Begin3;
ClusterIter End3;
};
/// Deterministically compare clusters by their density in decreasing order
bool compareClusters(const Cluster *C1, const Cluster *C2) {
// Original entry point to the front
if (C1->isEntryPoint())
return true;
if (C2->isEntryPoint())
return false;
const double D1 = C1->density();
const double D2 = C2->density();
if (D1 != D2)
return D1 > D2;
// Making the order deterministic
return C1->id() < C2->id();
}
/// Deterministically compare pairs of clusters
bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
const Cluster *A2, const Cluster *B2) {
const auto Samples1 = A1->executionCount() + B1->executionCount();
const auto Samples2 = A2->executionCount() + B2->executionCount();
if (Samples1 != Samples2)
return Samples1 < Samples2;
// Making the order deterministic
if (A1 != A2)
return A1->id() < A2->id();
return B1->id() < B2->id();
}
} // end namespace anonymous
/// CachePlus - layout of basic blocks with i-cache optimization.
///
/// Similarly to OptimizeCacheReorderAlgorithm, this algorithm is a greedy
/// heuristic that works with clusters (ordered sequences) of basic blocks.
/// Initially all clusters are isolated basic blocks. On every iteration,
/// we pick a pair of clusters whose merging yields the biggest increase in
/// the ExtTSP metric (see CacheMetrics.cpp for exact implementation), which
/// models how i-cache "friendly" a specific cluster is. A pair of clusters
/// giving the maximum gain is merged into a new cluster. The procedure stops
/// when there is only one cluster left, or when merging does not increase
/// ExtTSP. In the latter case, the remaining clusters are sorted by density.
///
/// An important aspect is the way two clusters are merged. Unlike earlier
/// algorithms (e.g., OptimizeCacheReorderAlgorithm or Pettis-Hansen), the
/// first of the two clusters, X and Y, is split into X1 and X2, giving three
/// clusters, X1, X2, and Y. Then we
/// consider all possible ways of gluing the three clusters (e.g., X1YX2, X1X2Y,
/// X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
/// This improves the quality of the final result (the search space is larger)
/// while keeping the implementation sufficiently fast.
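/// As a concrete illustration (hypothetical blocks, not tied to any function
/// below): if X = [A, B, C] is split at offset 1 into X1 = [A] and
/// X2 = [B, C], and Y = [D], then the gluing X1YX2 yields the order A D B C
/// and X2YX1 yields B C D A; among all candidates, the one with the highest
/// ExtTSP score is kept.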
class CachePlus {
public:
CachePlus(const BinaryFunction &BF)
: BF(BF),
Adjacent(BF.layout_size()),
Cache(BF.layout_size()) {
initialize();
}
/// Run cache+ algorithm and return a basic block ordering
std::vector<BinaryBasicBlock *> run() {
// Pass 1: Merge blocks with their fallthrough successors
mergeFallthroughs();
// Pass 2: Merge pairs of clusters while improving the ExtTSP metric
mergeClusterPairs();
// Pass 3: Merge cold blocks to reduce code size
mergeColdClusters();
// Sorting clusters by density
std::stable_sort(Clusters.begin(), Clusters.end(), compareClusters);
// Collect the basic blocks in the order specified by their clusters
std::vector<BinaryBasicBlock *> Result;
Result.reserve(BF.layout_size());
for (auto Cluster : Clusters) {
Result.insert(Result.end(),
Cluster->blocks().begin(),
Cluster->blocks().end());
}
return Result;
}
private:
/// Initialize the set of active clusters, edges between blocks, and
/// adjacency matrix.
void initialize() {
// Initialize indices of basic blocks
size_t LayoutIndex = 0;
for (auto BB : BF.layout()) {
BB->setLayoutIndex(LayoutIndex);
LayoutIndex++;
}
// Initialize edges for the blocks and compute their total in/out weights
OutEdges = std::vector<EdgeList>(BF.layout_size());
auto InWeight = std::vector<uint64_t>(BF.layout_size(), 0);
auto OutWeight = std::vector<uint64_t>(BF.layout_size(), 0);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (auto I : BB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"missing profile for a jump");
if (I != BB && BI->Count > 0) {
InWeight[I->getLayoutIndex()] += BI->Count;
OutEdges[BB->getLayoutIndex()].push_back(std::make_pair(I, BI->Count));
OutWeight[BB->getLayoutIndex()] += BI->Count;
}
++BI;
}
}
// Initialize execution count for every basic block, which is the
// maximum of its known profile count and the sums of its in and out
// edge weights. The entry point's execution count is set to at least 1.
auto ExecutionCounts = std::vector<uint64_t>(BF.layout_size(), 0);
for (auto BB : BF.layout()) {
uint64_t EC = BB->getKnownExecutionCount();
EC = std::max(EC, InWeight[BB->getLayoutIndex()]);
EC = std::max(EC, OutWeight[BB->getLayoutIndex()]);
if (BB->getLayoutIndex() == 0)
EC = std::max(EC, uint64_t(1));
ExecutionCounts[BB->getLayoutIndex()] = EC;
}
// Create a separate MCCodeEmitter to allow lock-free execution
BinaryContext::IndependentCodeEmitter Emitter;
if (!opts::NoThreads) {
Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
}
// Initialize clusters
Clusters.reserve(BF.layout_size());
AllClusters.reserve(BF.layout_size());
CurCluster.reserve(BF.layout_size());
Size.reserve(BF.layout_size());
for (auto BB : BF.layout()) {
size_t Index = BB->getLayoutIndex();
Size.push_back(
std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1));
AllClusters.emplace_back(BB, ExecutionCounts[Index], Size[Index]);
Clusters.push_back(&AllClusters[Index]);
CurCluster.push_back(&AllClusters[Index]);
}
// Initialize adjacency matrix
Adjacent.initialize(Clusters);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (auto I : BB->successors()) {
if (BB != I && BI->Count > 0) {
Adjacent.set(Clusters[BB->getLayoutIndex()],
Clusters[I->getLayoutIndex()]);
}
++BI;
}
}
// Initialize fallthrough successors
findFallthroughBlocks(InWeight, OutWeight);
}
/// Merge blocks with their fallthrough successors.
void mergeFallthroughs() {
for (auto BB : BF.layout()) {
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
auto CurBB = BB;
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
mergeClusters(&AllClusters[BB->getLayoutIndex()],
&AllClusters[NextBB->getLayoutIndex()],
0);
CurBB = NextBB;
}
}
}
}
/// Merge pairs of clusters while improving the ExtTSP metric
void mergeClusterPairs() {
while (Clusters.size() > 1) {
Cluster *BestClusterPred = nullptr;
Cluster *BestClusterSucc = nullptr;
std::pair<double, size_t> BestGain(-1, 0);
for (auto ClusterPred : Clusters) {
// Do not merge cold blocks
if (ClusterPred->isCold())
continue;
// Get candidates for merging with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// Find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");
// Compute the gain of merging two clusters
auto Gain = mergeGain(ClusterPred, ClusterSucc);
if (Gain.first <= 0.0)
return;
// Break ties by density so that the hottest clusters are merged first
if (Gain.first > BestGain.first ||
(std::abs(Gain.first - BestGain.first) < 1e-8 &&
compareClusterPairs(ClusterPred,
ClusterSucc,
BestClusterPred,
BestClusterSucc))) {
BestGain = Gain;
BestClusterPred = ClusterPred;
BestClusterSucc = ClusterSucc;
}
});
}
// Stop merging when there is no improvement
if (BestGain.first <= 0.0)
break;
// Merge the best pair of clusters
mergeClusters(BestClusterPred, BestClusterSucc, BestGain.second);
}
}
/// Merge cold blocks to reduce code size
void mergeColdClusters() {
for (auto SrcBB : BF.layout()) {
// Iterating in reverse order to make sure original fall-through jumps are
// merged first
for (auto Itr = SrcBB->succ_rbegin(); Itr != SrcBB->succ_rend(); ++Itr) {
BinaryBasicBlock *DstBB = *Itr;
auto SrcCluster = CurCluster[SrcBB->getLayoutIndex()];
auto DstCluster = CurCluster[DstBB->getLayoutIndex()];
if (SrcCluster != DstCluster && !DstCluster->isEntryPoint() &&
SrcCluster->blocks().back() == SrcBB &&
DstCluster->blocks().front() == DstBB) {
mergeClusters(SrcCluster, DstCluster, 0);
}
}
}
}
/// For a pair of blocks, A and B, block B is the fallthrough successor of A,
/// if (i) all jumps (based on profile) from A go to B and (ii) all jumps
/// to B are from A. Such blocks should be adjacent in an optimal ordering,
/// and the method finds such pairs of blocks.
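/// For example (hypothetical profile counts): if A's only profiled jump is
/// A -> B with weight 100, and A -> B is also the only profiled jump into B,
/// then OutWeight[A] == InWeight[B] == 100 and B is recorded as A's
/// fallthrough successor (unless B is the entry block, which stays pinned at
/// layout index 0).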
void findFallthroughBlocks(const std::vector<uint64_t> &InWeight,
const std::vector<uint64_t> &OutWeight) {
FallthroughSucc = std::vector<BinaryBasicBlock *>(BF.size(), nullptr);
FallthroughPred = std::vector<BinaryBasicBlock *>(BF.size(), nullptr);
// Find fallthroughs based on edge weights
for (auto BB : BF.layout()) {
if (BB->succ_size() == 1 &&
BB->getSuccessor()->pred_size() == 1 &&
BB->getSuccessor()->getLayoutIndex() != 0) {
FallthroughSucc[BB->getLayoutIndex()] = BB->getSuccessor();
FallthroughPred[BB->getSuccessor()->getLayoutIndex()] = BB;
continue;
}
if (OutWeight[BB->getLayoutIndex()] == 0)
continue;
for (auto Edge : OutEdges[BB->getLayoutIndex()]) {
const auto SuccBB = Edge.first;
// Successor cannot be the first BB, which is pinned
if (OutWeight[BB->getLayoutIndex()] == Edge.second &&
InWeight[SuccBB->getLayoutIndex()] == Edge.second &&
SuccBB->getLayoutIndex() != 0) {
FallthroughSucc[BB->getLayoutIndex()] = SuccBB;
FallthroughPred[SuccBB->getLayoutIndex()] = BB;
break;
}
}
}
// There might be 'cycles' in the fallthrough dependencies (since profile
// data isn't 100% accurate).
// Break the cycles by choosing the block with the smallest index as the tail
for (auto BB : BF.layout()) {
const auto Idx = BB->getLayoutIndex();
if (FallthroughSucc[Idx] == nullptr || FallthroughPred[Idx] == nullptr)
continue;
auto SuccBB = FallthroughSucc[Idx];
while (SuccBB != nullptr && SuccBB != BB) {
SuccBB = FallthroughSucc[SuccBB->getLayoutIndex()];
}
if (SuccBB == nullptr)
continue;
// break the cycle
FallthroughSucc[FallthroughPred[Idx]->getLayoutIndex()] = nullptr;
FallthroughPred[Idx] = nullptr;
}
}
/// Compute ExtTSP score for a given order of basic blocks
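/// (The per-jump scoring is implemented in CacheMetrics::extTSPScore; roughly,
/// a jump taken Count times from a source block at SrcAddr of size SrcSize
/// to DstAddr contributes the full Count when it is a fallthrough, i.e.
/// DstAddr == SrcAddr + SrcSize, and a distance-decayed fraction of Count for
/// short forward or backward jumps; see CacheMetrics.cpp for the exact
/// constants.)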
double score(const MergedCluster& MergedBlocks) const {
uint64_t NotSet = static_cast<uint64_t>(-1);
EstimatedAddr.assign(BF.layout_size(), NotSet);
uint64_t CurAddr = 0;
MergedBlocks.forEach(
[&](const BinaryBasicBlock *BB) {
size_t Index = BB->getLayoutIndex();
EstimatedAddr[Index] = CurAddr;
CurAddr += Size[Index];
}
);
double Score = 0;
MergedBlocks.forEach(
[&](const BinaryBasicBlock *BB) {
size_t Index = BB->getLayoutIndex();
for (auto Edge : OutEdges[Index]) {
auto SuccBB = Edge.first;
size_t SuccIndex = SuccBB->getLayoutIndex();
if (EstimatedAddr[SuccIndex] != NotSet) {
Score += CacheMetrics::extTSPScore(EstimatedAddr[Index],
Size[Index],
EstimatedAddr[SuccIndex],
Edge.second);
}
}
}
);
return Score;
}
/// Verify if it is valid to merge two clusters into the new one
bool isValidMerge(const Cluster *ClusterPred,
const Cluster *ClusterSucc,
size_t MergeType,
const MergedCluster& MergedBlocks) const {
// Does the new cluster preserve the original entry point?
if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
MergedBlocks.getFirstBlock()->getLayoutIndex() != 0)
return false;
// This corresponds to a concatenation of clusters w/o splitting, which is
// always safe
if (MergeType == 0)
return true;
size_t Offset = MergeType / 5;
// The basic blocks on the boundary of a split of ClusterPred
auto BB1 = ClusterPred->blocks()[Offset - 1];
auto BB2 = ClusterPred->blocks()[Offset];
// Does the splitting break FT successors?
if (FallthroughSucc[BB1->getLayoutIndex()] != nullptr) {
assert(FallthroughSucc[BB1->getLayoutIndex()] == BB2 &&
"Fallthrough successor is not preserved");
return false;
}
// Do not split large clusters to reduce computation time
if (ClusterPred->blocks().size() > opts::ClusterSplitThreshold) {
return false;
}
return true;
}
/// The gain of merging two clusters.
///
/// The function considers all possible ways of merging two clusters and
/// picks the one with the largest increase in the ExtTSP metric. The result
/// is a pair of the gain and the corresponding merge type (encoded as an
/// integer).
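/// For example, MergeType == 7 decodes below as Offset = 7 / 5 = 1 and
/// Type = 7 % 5 = 2: split ClusterPred after its first block and glue the
/// three pieces in the Type-2 order implemented by mergeBlocks().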
std::pair<double, size_t> mergeGain(const Cluster *ClusterPred,
const Cluster *ClusterSucc) const {
if (Cache.contains(ClusterPred, ClusterSucc)) {
return Cache.get(ClusterPred, ClusterSucc);
}
// The current score of two separate clusters
const auto CurScore = ClusterPred->score() + ClusterSucc->score();
// Merge two clusters and update the best Gain
auto computeMergeGain = [&](const std::pair<double, size_t> &CurGain,
const Cluster *ClusterPred,
const Cluster *ClusterSucc,
size_t MergeType) {
auto MergedBlocks = mergeBlocks(ClusterPred->blocks(),
ClusterSucc->blocks(),
MergeType);
if (!isValidMerge(ClusterPred, ClusterSucc, MergeType, MergedBlocks))
return CurGain;
// The score of the new cluster
const auto NewScore = score(MergedBlocks);
if (NewScore > CurScore && NewScore - CurScore > CurGain.first)
return std::make_pair(NewScore - CurScore, MergeType);
else
return CurGain;
};
std::pair<double, size_t> Gain = std::make_pair(-1, 0);
// Try to concatenate two clusters w/o splitting
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
// Try to split ClusterPred into two sub-clusters in various ways and then
// merge it with ClusterSucc
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
for (size_t Type = 1; Type <= 4; Type++) {
size_t MergeType = Type + Offset * 5;
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
}
}
Cache.set(ClusterPred, ClusterSucc, Gain);
return Gain;
}
/// Merge two clusters of blocks respecting a given merge 'type' and 'offset'.
///
/// If MergeType == 0, then the result is a concatenation of two clusters.
/// Otherwise, the first cluster is cut into two sub-clusters at the offset,
/// and merged using all possible ways of concatenating three clusters.
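/// Concretely, with X split into X1 and X2 at the given offset, Types 1-4 in
/// the switch below produce the orders X1 Y X2, Y X2 X1, X2 Y X1, and
/// X2 X1 Y, respectively.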
MergedCluster mergeBlocks(const std::vector<BinaryBasicBlock *> &X,
const std::vector<BinaryBasicBlock *> &Y,
size_t MergeType) const {
// Merging w/o splitting existing clusters
if (MergeType == 0) {
ClusterIter Empty;
return MergedCluster(X.begin(), X.end(), Y.begin(), Y.end(), Empty, Empty);
}
size_t Type = MergeType % 5;
size_t Offset = MergeType / 5;
assert(0 < Offset && Offset < X.size() &&
"Invalid offset while merging clusters");
// Split the first cluster, X, into X1 and X2
ClusterIter BeginX1 = X.begin();
ClusterIter EndX1 = X.begin() + Offset;
ClusterIter BeginX2 = X.begin() + Offset;
ClusterIter EndX2 = X.end();
ClusterIter BeginY = Y.begin();
ClusterIter EndY = Y.end();
// Construct a new cluster from three existing ones
switch(Type) {
case 1: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
case 2: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
case 3: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1);
case 4: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
default:
llvm_unreachable("unexpected merge type");
}
}
/// Merge cluster From into cluster Into, update the list of active clusters,
/// adjacency information, and the corresponding cache.
void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) {
assert(Into != From && "Cluster cannot be merged with itself");
// Merge the blocks of clusters
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));
From->clear();
// Remove cluster From from the list of active clusters
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
Clusters.erase(Iter, Clusters.end());
// Update block clusters
for (auto BB : Into->blocks()) {
CurCluster[BB->getLayoutIndex()] = Into;
}
// Invalidate caches
Cache.invalidate(Into);
// Update the adjacency matrix
Adjacent.merge(Into, From);
}
// The binary function
const BinaryFunction &BF;
// All clusters
std::vector<Cluster> AllClusters;
// Active clusters. The vector gets updated at runtime when clusters are merged
std::vector<Cluster *> Clusters;
// Current cluster of a basic block
std::vector<Cluster *> CurCluster;
// Size of the block
std::vector<uint64_t> Size;
// Outgoing edges of the block
std::vector<EdgeList> OutEdges;
// Cluster adjacency matrix
AdjacencyMatrix<Cluster> Adjacent;
// Fallthrough successor of the block
std::vector<BinaryBasicBlock *> FallthroughSucc;
// Fallthrough predecessor of the block
std::vector<BinaryBasicBlock *> FallthroughPred;
// A cache that keeps precomputed values of mergeGain for pairs of clusters;
// when a pair of clusters (x,y) gets merged, we invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;
// A reusable vector used within score() method
mutable std::vector<uint64_t> EstimatedAddr;
};
void CachePlusReorderAlgorithm::reorderBasicBlocks(
const BinaryFunction &BF, BasicBlockOrder &Order) const {
if (BF.layout_empty())
return;
// Count the basic blocks with positive execution count
size_t NumHotBlocks = 0;
for (auto BB : BF.layout()) {
if (BB->getKnownExecutionCount() > 0)
NumHotBlocks++;
}
// Do not change layout of functions w/o profile information
if (NumHotBlocks == 0 || BF.layout_size() <= 1) {
for (auto BB : BF.layout()) {
Order.push_back(BB);
}
return;
}
// Apply the algorithm
Order = CachePlus(BF).run();
// Verify correctness
assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
assert(Order.size() == BF.layout_size() && "Wrong size of reordered layout");
}
} // namespace bolt
} // namespace llvm

View File

@ -1,8 +1,31 @@
#include "DataflowAnalysis.h"
#define DEBUG_TYPE "dataflow"
namespace llvm {
raw_ostream &operator<<(raw_ostream &OS, const BitVector &Val) {
raw_ostream &operator<<(raw_ostream &OS, const BitVector &State) {
LLVM_DEBUG({
OS << "BitVector(";
auto Sep = "";
if (State.count() > (State.size() >> 1)) {
OS << "all, except: ";
auto BV = State;
BV.flip();
for (auto I = BV.find_first(); I != -1; I = BV.find_next(I)) {
OS << Sep << I;
Sep = " ";
}
OS << ")";
return OS;
}
for (auto I = State.find_first(); I != -1; I = State.find_next(I)) {
OS << Sep << I;
Sep = " ";
}
OS << ")";
return OS;
});
OS << "BitVector";
return OS;
}
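// Example (hypothetical values, with debug output enabled): an 8-bit
// BitVector with bits {0, 2} set prints as "BitVector(0 2)"; with bits 0-6
// set it prints as "BitVector(all, except: 7)". In builds where LLVM_DEBUG
// compiles out, only the "BitVector" prefix is emitted.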

View File

@ -34,7 +34,7 @@
#include "BinaryFunction.h"
#include "CacheMetrics.h"
#include "ReorderAlgorithm.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
using namespace bolt;

View File

@ -1,800 +0,0 @@
#include "Passes/FeatureMiner.h"
#include "Passes/DataflowInfoManager.h"
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-feature-miner"
namespace llvm {
namespace bolt {
class BinaryFunction;
int8_t FeatureMiner::getProcedureType(BinaryFunction &Function,
BinaryContext &BC) {
int8_t ProcedureType = 1;
for (auto &BB : Function) {
for (auto &Inst : BB) {
if (BC.MIB->isCall(Inst)) {
ProcedureType = 0; // non-leaf type
if (const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst)) {
const auto *Callee = BC.getFunctionForSymbol(CalleeSymbol);
if (Callee &&
Callee->getFunctionNumber() == Function.getFunctionNumber()) {
return 2; // call self type
}
}
}
}
}
return ProcedureType; // leaf type
}
void FeatureMiner::addSuccessorInfo(DominatorAnalysis<false> &DA,
DominatorAnalysis<true> &PDA,
SBIPtr const &SBI, BinaryFunction &Function,
BinaryContext &BC, MCInst &Inst,
BinaryBasicBlock &BB, bool SuccType) {
BinaryBasicBlock *Successor = BB.getConditionalSuccessor(SuccType);
if (!Successor)
return;
unsigned NumLoads{0};
unsigned NumStores{0};
unsigned NumCallsExit{0};
unsigned NumCalls{0};
unsigned NumCallsInvoke{0};
unsigned NumTailCalls{0};
unsigned NumIndirectCalls{0};
for (auto &Inst : BB) {
if (BC.MIB->isLoad(Inst)) {
++NumLoads;
} else if (BC.MIB->isStore(Inst)) {
++NumStores;
} else if (BC.MIB->isCall(Inst)) {
++NumCalls;
if (BC.MIB->isIndirectCall(Inst))
++NumIndirectCalls;
if (BC.MIB->isInvoke(Inst))
++NumCallsInvoke;
if (BC.MIB->isTailCall(Inst))
++NumTailCalls;
if (const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst)) {
StringRef CalleeName = CalleeSymbol->getName();
if (CalleeName == "__cxa_throw@PLT" ||
CalleeName == "_Unwind_Resume@PLT" ||
CalleeName == "__cxa_rethrow@PLT" || CalleeName == "exit@PLT" ||
CalleeName == "abort@PLT")
++NumCallsExit;
}
}
}
BBIPtr SuccBBInfo = std::make_unique<struct BasicBlockInfo>();
// Check if the successor basic block is a loop header and store it.
SuccBBInfo->LoopHeader = BPI->isLoopHeader(Successor);
SuccBBInfo->BasicBlockSize = Successor->size();
// Check if the edge getting to the successor basic block is a loop
// exit edge and store it.
SuccBBInfo->Exit = BPI->isExitEdge(&BB, Successor);
// Check if the edge getting to the successor basic block is a loop
// back edge and store it.
SuccBBInfo->Backedge = BPI->isBackEdge(&BB, Successor);
MCInst *SuccInst = Successor->getTerminatorBefore(nullptr);
// Store information about the branch type ending the successor basic block
SuccBBInfo->EndOpcode = (SuccInst && BC.MIA->isBranch(*SuccInst))
? SuccInst->getOpcode()
: 0; // 0 = NOTHING
if (SuccBBInfo->EndOpcode != 0)
SuccBBInfo->EndOpcodeStr = BC.MII->getName(SuccInst->getOpcode());
else
SuccBBInfo->EndOpcodeStr = "NOTHING";
// Check if the successor basic block contains
// a procedure call and store it.
SuccBBInfo->Call = (NumCalls > 0) ? 1 // Contains a call instruction
: 0; // Does not contain a call instruction
SuccBBInfo->NumStores = NumStores;
SuccBBInfo->NumLoads = NumLoads;
SuccBBInfo->NumCallsExit = NumCallsExit;
SuccBBInfo->NumCalls = NumCalls;
SuccBBInfo->NumCallsInvoke = NumCallsInvoke;
SuccBBInfo->NumIndirectCalls = NumIndirectCalls;
SuccBBInfo->NumTailCalls = NumTailCalls;
auto InstSucc = Successor->getLastNonPseudoInstr();
if (InstSucc) {
// Check if the source basic block dominates its
// target basic block and store it.
SuccBBInfo->BranchDominates = (DA.doesADominateB(Inst, *InstSucc) == true)
? 1 // Dominates
: 0; // Does not dominate
// Check if the target basic block postdominates
// the source basic block and store it.
SuccBBInfo->BranchPostdominates =
(PDA.doesADominateB(*InstSucc, Inst) == true)
? 1 // Postdominates
: 0; // Does not postdominate
}
/// The following information is used as an identifier only for
/// the purpose of matching the inferred probabilities with the branches
/// in the binary.
SuccBBInfo->FromFunName = Function.getPrintName();
SuccBBInfo->FromBb = BB.getInputOffset();
BinaryFunction *ToFun = Successor->getFunction();
SuccBBInfo->ToFunName = ToFun->getPrintName();
SuccBBInfo->ToBb = Successor->getInputOffset();
auto Offset = BC.MIB->tryGetAnnotationAs<uint64_t>(Inst, "Offset");
if (Offset) {
int64_t Delta = Successor->getInputOffset() - Offset.get();
SBI->DeltaTaken = std::abs(Delta);
}
if (SuccType) {
SBI->TrueSuccessor = std::move(SuccBBInfo);
// Check if the taken branch is a forward
// or a backwards branch and store it.
SBI->Direction = (Function.isForwardBranch(&BB, Successor) == true)
? 1 // Forward branch
: 0; // Backwards branch
auto TakenBranchInfo = BB.getTakenBranchInfo();
SBI->Count = TakenBranchInfo.Count;
SBI->MissPredicted = TakenBranchInfo.MispredictedCount;
} else {
SBI->FalseSuccessor = std::move(SuccBBInfo);
auto FallthroughBranchInfo = BB.getFallthroughBranchInfo();
SBI->FallthroughCount = FallthroughBranchInfo.Count;
SBI->FallthroughMissPredicted = FallthroughBranchInfo.MispredictedCount;
}
}
void FeatureMiner::extractFeatures(BinaryFunction &Function,
BinaryContext &BC) {
int8_t ProcedureType = getProcedureType(Function, BC);
auto Info = DataflowInfoManager(BC, Function, nullptr, nullptr);
auto &DA = Info.getDominatorAnalysis();
auto &PDA = Info.getPostDominatorAnalysis();
const BinaryLoopInfo &LoopsInfo = Function.getLoopInfo();
bool Simple = Function.isSimple();
for (auto &BB : Function) {
unsigned NumOuterLoops{0};
unsigned TotalLoops{0};
unsigned MaximumLoopDepth{0};
unsigned LoopDepth{0};
unsigned LoopNumExitEdges{0};
unsigned LoopNumExitBlocks{0};
unsigned LoopNumExitingBlocks{0};
unsigned LoopNumLatches{0};
unsigned LoopNumBlocks{0};
unsigned LoopNumBackEdges{0};
bool LocalExitingBlock{false};
bool LocalLatchBlock{false};
bool LocalLoopHeader{false};
BinaryLoop *Loop = LoopsInfo.getLoopFor(&BB);
if (Loop) {
SmallVector<BinaryBasicBlock *, 1> ExitingBlocks;
Loop->getExitingBlocks(ExitingBlocks);
SmallVector<BinaryBasicBlock *, 1> ExitBlocks;
Loop->getExitBlocks(ExitBlocks);
SmallVector<BinaryLoop::Edge, 1> ExitEdges;
Loop->getExitEdges(ExitEdges);
SmallVector<BinaryBasicBlock *, 1> Latches;
Loop->getLoopLatches(Latches);
NumOuterLoops = LoopsInfo.OuterLoops;
TotalLoops = LoopsInfo.TotalLoops;
MaximumLoopDepth = LoopsInfo.MaximumDepth;
LoopDepth = Loop->getLoopDepth();
LoopNumExitEdges = ExitEdges.size();
LoopNumExitBlocks = ExitBlocks.size();
LoopNumExitingBlocks = ExitingBlocks.size();
LoopNumLatches = Latches.size();
LoopNumBlocks = Loop->getNumBlocks();
LoopNumBackEdges = Loop->getNumBackEdges();
LocalExitingBlock = Loop->isLoopExiting(&BB);
LocalLatchBlock = Loop->isLoopLatch(&BB);
LocalLoopHeader = ((Loop->getHeader() == (&BB)) ? 1 : 0);
}
unsigned NumLoads{0};
unsigned NumStores{0};
unsigned NumCallsExit{0};
unsigned NumCalls{0};
unsigned NumCallsInvoke{0};
unsigned NumTailCalls{0};
unsigned NumIndirectCalls{0};
unsigned NumSelfCalls{0};
for (auto &Inst : BB) {
if (BC.MIB->isLoad(Inst)) {
++NumLoads;
} else if (BC.MIB->isStore(Inst)) {
++NumStores;
} else if (BC.MIB->isCall(Inst)) {
++NumCalls;
if (BC.MIB->isIndirectCall(Inst))
++NumIndirectCalls;
if (BC.MIB->isInvoke(Inst))
++NumCallsInvoke;
if (BC.MIB->isTailCall(Inst))
++NumTailCalls;
if (const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst)) {
StringRef CalleeName = CalleeSymbol->getName();
if (CalleeName == "__cxa_throw@PLT" ||
CalleeName == "_Unwind_Resume@PLT" ||
CalleeName == "__cxa_rethrow@PLT" || CalleeName == "exit@PLT" ||
CalleeName == "abort@PLT")
++NumCallsExit;
else if (CalleeName == Function.getPrintName()) {
++NumSelfCalls;
}
}
}
}
int Index = -2;
bool LoopHeader = BPI->isLoopHeader(&BB);
for (auto &Inst : BB) {
++Index;
if (!BC.MIA->isConditionalBranch(Inst))
continue;
SBIPtr SBI = std::make_unique<struct StaticBranchInfo>();
SBI->Simple = Simple;
SBI->NumOuterLoops = NumOuterLoops;
SBI->TotalLoops = TotalLoops;
SBI->MaximumLoopDepth = MaximumLoopDepth;
SBI->LoopDepth = LoopDepth;
SBI->LoopNumExitEdges = LoopNumExitEdges;
SBI->LoopNumExitBlocks = LoopNumExitBlocks;
SBI->LoopNumExitingBlocks = LoopNumExitingBlocks;
SBI->LoopNumLatches = LoopNumLatches;
SBI->LoopNumBlocks = LoopNumBlocks;
SBI->LoopNumBackEdges = LoopNumBackEdges;
SBI->LocalExitingBlock = LocalExitingBlock;
SBI->LocalLatchBlock = LocalLatchBlock;
SBI->LocalLoopHeader = LocalLoopHeader;
SBI->Call = ((NumCalls > 0) ? 1 : 0);
SBI->NumCalls = NumCalls;
SBI->BasicBlockSize = BB.size();
SBI->NumBasicBlocks = Function.size();
SBI->NumSelfCalls = NumSelfCalls;
SBI->NumLoads = NumLoads;
SBI->NumStores = NumStores;
SBI->NumCallsExit = NumCallsExit;
SBI->NumCallsInvoke = NumCallsInvoke;
SBI->NumIndirectCalls = NumIndirectCalls;
SBI->NumTailCalls = NumTailCalls;
// Check if the branch's basic block is a loop header and store it.
SBI->LoopHeader = LoopHeader;
// Adding taken successor info.
addSuccessorInfo(DA, PDA, SBI, Function, BC, Inst, BB, true);
// Adding fall-through successor info.
addSuccessorInfo(DA, PDA, SBI, Function, BC, Inst, BB, false);
// Holds the branch opcode info.
SBI->Opcode = Inst.getOpcode();
SBI->OpcodeStr = BC.MII->getName(Inst.getOpcode());
// Holds the branch's procedure type.
SBI->ProcedureType = ProcedureType;
SBI->CmpOpcode = 0;
if (Index > -1) {
auto Cmp = BB.begin() + Index;
if (BC.MII->get((*Cmp).getOpcode()).isCompare()) {
// Holds the branch comparison opcode info.
SBI->CmpOpcode = (*Cmp).getOpcode();
SBI->CmpOpcodeStr = BC.MII->getName((*Cmp).getOpcode());
auto getOperandType = [&](const MCOperand &Operand) -> int32_t {
if (Operand.isReg())
return 0;
else if (Operand.isImm())
return 1;
else if (Operand.isFPImm())
return 2;
else if (Operand.isExpr())
return 3;
else
return -1;
};
const auto InstInfo = BC.MII->get((*Cmp).getOpcode());
unsigned NumDefs = InstInfo.getNumDefs();
int32_t NumPrimeOperands =
MCPlus::getNumPrimeOperands(*Cmp) - NumDefs;
switch (NumPrimeOperands) {
case 6: {
int32_t RBType = getOperandType((*Cmp).getOperand(NumDefs));
int32_t RAType = getOperandType((*Cmp).getOperand(NumDefs + 1));
if (RBType == 0 && RAType == 0) {
SBI->OperandRBType = RBType;
SBI->OperandRAType = RAType;
} else if (RBType == 0 && (RAType == 1 || RAType == 2)) {
RAType = getOperandType((*Cmp).getOperand(NumPrimeOperands - 1));
if (RAType != 1 && RAType != 2) {
RAType = -1;
}
SBI->OperandRBType = RBType;
SBI->OperandRAType = RAType;
} else {
SBI->OperandRAType = -1;
SBI->OperandRBType = -1;
}
break;
}
case 2:
SBI->OperandRBType = getOperandType((*Cmp).getOperand(NumDefs));
SBI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs + 1));
break;
case 3:
SBI->OperandRBType = getOperandType((*Cmp).getOperand(NumDefs));
SBI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs + 2));
break;
case 1:
SBI->OperandRAType = getOperandType((*Cmp).getOperand(NumDefs));
break;
default:
SBI->OperandRAType = -1;
SBI->OperandRBType = -1;
break;
}
} else {
Index -= 1;
for (int Idx = Index; Idx > -1; Idx--) {
auto Cmp = BB.begin() + Idx;
if (BC.MII->get((*Cmp).getOpcode()).isCompare()) {
// Holds the branch comparison opcode info.
SBI->CmpOpcode = (*Cmp).getOpcode();
SBI->CmpOpcodeStr = BC.MII->getName((*Cmp).getOpcode());
break;
}
}
}
}
this->BranchesInfoSet.push_back(std::move(SBI));
}
}
}
void FeatureMiner::dumpSuccessorFeatures(raw_ostream &Printer,
BBIPtr &Successor) {
int16_t BranchDominates =
(Successor->BranchDominates.hasValue())
? static_cast<bool>(*(Successor->BranchDominates))
: -1;
int16_t BranchPostdominates =
(Successor->BranchPostdominates.hasValue())
? static_cast<bool>(*(Successor->BranchPostdominates))
: -1;
int16_t LoopHeader = (Successor->LoopHeader.hasValue())
? static_cast<bool>(*(Successor->LoopHeader))
: -1;
int16_t Backedge = (Successor->Backedge.hasValue())
? static_cast<bool>(*(Successor->Backedge))
: -1;
int16_t Exit =
(Successor->Exit.hasValue()) ? static_cast<bool>(*(Successor->Exit)) : -1;
int16_t Call =
(Successor->Call.hasValue()) ? static_cast<bool>(*(Successor->Call)) : -1;
int32_t EndOpcode = (Successor->EndOpcode.hasValue())
? static_cast<int32_t>(*(Successor->EndOpcode))
: -1;
int64_t NumLoads = (Successor->NumLoads.hasValue())
? static_cast<int64_t>(*(Successor->NumLoads))
: -1;
int64_t NumStores = (Successor->NumStores.hasValue())
? static_cast<int64_t>(*(Successor->NumStores))
: -1;
int64_t BasicBlockSize =
(Successor->BasicBlockSize.hasValue())
? static_cast<int64_t>(*(Successor->BasicBlockSize))
: -1;
int64_t NumCalls = (Successor->NumCalls.hasValue())
? static_cast<int64_t>(*(Successor->NumCalls))
: -1;
int64_t NumCallsExit = (Successor->NumCallsExit.hasValue())
? static_cast<int64_t>(*(Successor->NumCallsExit))
: -1;
int64_t NumCallsInvoke =
(Successor->NumCallsInvoke.hasValue())
? static_cast<int64_t>(*(Successor->NumCallsInvoke))
: -1;
int64_t NumIndirectCalls =
(Successor->NumIndirectCalls.hasValue())
? static_cast<int64_t>(*(Successor->NumIndirectCalls))
: -1;
int64_t NumTailCalls = (Successor->NumTailCalls.hasValue())
? static_cast<int64_t>(*(Successor->NumTailCalls))
: -1;
Printer << "," << BranchDominates << "," << BranchPostdominates << ","
<< EndOpcode << "," << Successor->EndOpcodeStr << "," << LoopHeader
<< "," << Backedge << "," << Exit << "," << Call << ","
<< Successor->FromFunName << ","
<< Twine::utohexstr(Successor->FromBb) << "," << Successor->ToFunName
<< "," << Twine::utohexstr(Successor->ToBb) << "," << NumLoads << ","
<< NumStores << "," << BasicBlockSize << "," << NumCalls << ","
<< NumCallsExit << "," << NumIndirectCalls << "," << NumCallsInvoke
<< "," << NumTailCalls;
}
void FeatureMiner::dumpFeatures(raw_ostream &Printer,
uint64_t FunctionAddress) {
for (auto const &SBI : BranchesInfoSet) {
auto &FalseSuccessor = SBI->FalseSuccessor;
auto &TrueSuccessor = SBI->TrueSuccessor;
if (!FalseSuccessor && !TrueSuccessor)
continue;
int16_t ProcedureType = (SBI->ProcedureType.hasValue())
? static_cast<int16_t>(*(SBI->ProcedureType))
: -1;
int16_t Direction =
(SBI->Direction.hasValue()) ? static_cast<bool>(*(SBI->Direction)) : -1;
int16_t LoopHeader = (SBI->LoopHeader.hasValue())
? static_cast<bool>(*(SBI->LoopHeader))
: -1;
int32_t Opcode =
(SBI->Opcode.hasValue()) ? static_cast<int32_t>(*(SBI->Opcode)) : -1;
int32_t CmpOpcode = (SBI->CmpOpcode.hasValue())
? static_cast<int32_t>(*(SBI->CmpOpcode))
: -1;
int64_t Count =
(SBI->Count.hasValue()) ? static_cast<int64_t>(*(SBI->Count)) : -1;
int64_t MissPredicted = (SBI->MissPredicted.hasValue())
? static_cast<int64_t>(*(SBI->MissPredicted))
: -1;
int64_t FallthroughCount =
(SBI->FallthroughCount.hasValue())
? static_cast<int64_t>(*(SBI->FallthroughCount))
: -1;
int64_t FallthroughMissPredicted =
(SBI->FallthroughMissPredicted.hasValue())
? static_cast<int64_t>(*(SBI->FallthroughMissPredicted))
: -1;
int64_t NumOuterLoops = (SBI->NumOuterLoops.hasValue())
? static_cast<int64_t>(*(SBI->NumOuterLoops))
: -1;
int64_t TotalLoops = (SBI->TotalLoops.hasValue())
? static_cast<int64_t>(*(SBI->TotalLoops))
: -1;
int64_t MaximumLoopDepth =
(SBI->MaximumLoopDepth.hasValue())
? static_cast<int64_t>(*(SBI->MaximumLoopDepth))
: -1;
int64_t LoopDepth = (SBI->LoopDepth.hasValue())
? static_cast<int64_t>(*(SBI->LoopDepth))
: -1;
int64_t LoopNumExitEdges =
(SBI->LoopNumExitEdges.hasValue())
? static_cast<int64_t>(*(SBI->LoopNumExitEdges))
: -1;
int64_t LoopNumExitBlocks =
(SBI->LoopNumExitBlocks.hasValue())
? static_cast<int64_t>(*(SBI->LoopNumExitBlocks))
: -1;
int64_t LoopNumExitingBlocks =
(SBI->LoopNumExitingBlocks.hasValue())
? static_cast<int64_t>(*(SBI->LoopNumExitingBlocks))
: -1;
int64_t LoopNumLatches = (SBI->LoopNumLatches.hasValue())
? static_cast<int64_t>(*(SBI->LoopNumLatches))
: -1;
int64_t LoopNumBlocks = (SBI->LoopNumBlocks.hasValue())
? static_cast<int64_t>(*(SBI->LoopNumBlocks))
: -1;
int64_t LoopNumBackEdges =
(SBI->LoopNumBackEdges.hasValue())
? static_cast<int64_t>(*(SBI->LoopNumBackEdges))
: -1;
int64_t LocalExitingBlock =
(SBI->LocalExitingBlock.hasValue())
? static_cast<bool>(*(SBI->LocalExitingBlock))
: -1;
int64_t LocalLatchBlock = (SBI->LocalLatchBlock.hasValue())
? static_cast<bool>(*(SBI->LocalLatchBlock))
: -1;
int64_t LocalLoopHeader = (SBI->LocalLoopHeader.hasValue())
? static_cast<bool>(*(SBI->LocalLoopHeader))
: -1;
int64_t Call =
(SBI->Call.hasValue()) ? static_cast<bool>(*(SBI->Call)) : -1;
int64_t DeltaTaken = (SBI->DeltaTaken.hasValue())
? static_cast<int64_t>(*(SBI->DeltaTaken))
: -1;
int64_t NumLoads = (SBI->NumLoads.hasValue())
? static_cast<int64_t>(*(SBI->NumLoads))
: -1;
int64_t NumStores = (SBI->NumStores.hasValue())
? static_cast<int64_t>(*(SBI->NumStores))
: -1;
int64_t BasicBlockSize = (SBI->BasicBlockSize.hasValue())
? static_cast<int64_t>(*(SBI->BasicBlockSize))
: -1;
int64_t NumBasicBlocks = (SBI->NumBasicBlocks.hasValue())
? static_cast<int64_t>(*(SBI->NumBasicBlocks))
: -1;
int64_t NumCalls = (SBI->NumCalls.hasValue())
? static_cast<int64_t>(*(SBI->NumCalls))
: -1;
int64_t NumSelfCalls = (SBI->NumSelfCalls.hasValue())
? static_cast<int64_t>(*(SBI->NumSelfCalls))
: -1;
int64_t NumCallsExit = (SBI->NumCallsExit.hasValue())
? static_cast<int64_t>(*(SBI->NumCallsExit))
: -1;
int64_t OperandRAType = (SBI->OperandRAType.hasValue())
? static_cast<int32_t>(*(SBI->OperandRAType))
: -1;
int64_t OperandRBType = (SBI->OperandRBType.hasValue())
? static_cast<int32_t>(*(SBI->OperandRBType))
: -1;
int64_t NumCallsInvoke = (SBI->NumCallsInvoke.hasValue())
? static_cast<int64_t>(*(SBI->NumCallsInvoke))
: -1;
int64_t NumIndirectCalls =
(SBI->NumIndirectCalls.hasValue())
? static_cast<int64_t>(*(SBI->NumIndirectCalls))
: -1;
int64_t NumTailCalls = (SBI->NumTailCalls.hasValue())
? static_cast<int64_t>(*(SBI->NumTailCalls))
: -1;
Printer << SBI->Simple << "," << Opcode << "," << SBI->OpcodeStr << ","
<< Direction << "," << CmpOpcode << "," << SBI->CmpOpcodeStr << ","
<< LoopHeader << "," << ProcedureType << "," << Count << ","
<< MissPredicted << "," << FallthroughCount << ","
<< FallthroughMissPredicted << "," << NumOuterLoops << ","
<< NumCallsExit << "," << TotalLoops << "," << MaximumLoopDepth
<< "," << LoopDepth << "," << LoopNumExitEdges << ","
<< LoopNumExitBlocks << "," << LoopNumExitingBlocks << ","
<< LoopNumLatches << "," << LoopNumBlocks << "," << LoopNumBackEdges
<< "," << LocalExitingBlock << "," << LocalLatchBlock << ","
<< LocalLoopHeader << "," << Call << "," << DeltaTaken << ","
<< NumLoads << "," << NumStores << "," << NumCalls << ","
<< OperandRAType << "," << OperandRBType << "," << BasicBlockSize
<< "," << NumBasicBlocks << "," << NumCallsInvoke << ","
<< NumIndirectCalls << "," << NumTailCalls << "," << NumSelfCalls;
if (FalseSuccessor && TrueSuccessor) {
dumpSuccessorFeatures(Printer, TrueSuccessor);
dumpSuccessorFeatures(Printer, FalseSuccessor);
}
Printer << "," << Twine::utohexstr(FunctionAddress) << "\n";
}
BranchesInfoSet.clear();
}
void FeatureMiner::runOnFunctions(BinaryContext &BC) {
auto FileName = "features.csv";
outs() << "BOLT-DEBUG: Dumping Binary's Features to " << FileName << "\n";
std::error_code EC;
raw_fd_ostream Printer(FileName, EC, sys::fs::F_None);
if (EC) {
errs() << "BOLT-WARNING: " << EC.message() << ", unable to open "
<< FileName << " for output.\n";
return;
}
auto FILENAME = "profile_data_regular.fdata";
raw_fd_ostream Printer2(FILENAME, EC, sys::fs::F_None);
if (EC) {
dbgs() << "BOLT-WARNING: " << EC.message() << ", unable to open"
<< " " << FILENAME << " for output.\n";
return;
}
// CSV file header
Printer << "FUN_TYPE,OPCODE,OPCODE_STR,DIRECTION,CMP_OPCODE,CMP_OPCODE_STR,"
"LOOP_HEADER,PROCEDURE_TYPE,"
"COUNT_TAKEN,MISS_TAKEN,COUNT_NOT_TAKEN,MISS_NOT_TAKEN,"
"NUM_OUTER_LOOPS,NUM_CALLS_EXIT,TOTAL_LOOPS,MAXIMUM_LOOP_DEPTH,"
"LOOP_DEPTH,LOOP_NUM_EXIT_EDGES,LOOP_NUM_EXIT_BLOCKS,"
"LOOP_NUM_EXITING_BLOCKS,LOOP_NUM_LATCHES,LOOP_NUM_BLOCKS,"
"LOOP_NUM_BAKCEDGES,LOCAL_EXITING_BLOCK,LOCAL_LATCH_BLOCK,"
"LOCAL_LOOP_HEADER,CALL,DELTA_TAKEN,NUM_LOADS,NUM_STORES,"
"NUM_CALLS,OPERAND_RA_TYPE,OPERAND_RB_TYPE,BASIC_BLOCK_SIZE,"
"NUM_BASIC_BLOCKS,NUM_CALLS_INVOKE,NUM_INDIRECT_CALLS,"
"NUM_TAIL_CALLS,NUM_SELF_CALLS,TS_DOMINATES,TS_POSTDOMINATES,"
"TS_END_OPCODE,TS_END_OPCODE_STR,TS_LOOP_HEADER,TS_BACKEDGE,TS_"
"EXIT,TS_CALL,"
"TS_FROM_FUN_NAME,TS_FROM_BB,TS_TO_FUN_NAME,TS_TO_BB,TS_NUM_LOADS,"
"TS_NUM_STORES,TS_BASIC_BLOCK_SIZE,TS_NUM_CALLS,TS_NUM_CALLS_EXIT,"
"TS_NUM_INDIRECT_CALL,TS_NUM_CALLS_INVOKE,TS_NUM_TAIL_CALLS,"
"FS_DOMINATES,FS_POSTDOMINATES,FS_END_OPCODE,FS_END_OPCODE_STR,FS_"
"LOOP_HEADER,"
"FS_BACKEDGE,FS_EXIT,FS_CALL,FS_FROM_FUN_NAME,FS_FROM_BB,"
"FS_TO_FUN_NAME,FS_TO_BB,FS_NUM_LOADS,FS_NUM_STORES,"
"FS_BASIC_BLOCK_SIZE,FS_NUM_CALLS,FS_NUM_CALLS_EXIT,"
"FS_NUM_INDIRECT_CALL,FS_NUM_CALLS_INVOKE,FS_NUM_TAIL_CALLS,"
"FUN_ENTRY_ADDRESS\n";
auto &BFs = BC.getBinaryFunctions();
BPI = std::make_unique<BranchPredictionInfo>();
for (auto &BFI : BFs) {
BinaryFunction &Function = BFI.second;
if (Function.empty()) // || !Function.isSimple())
continue;
if (!Function.isLoopFree()) {
const BinaryLoopInfo &LoopsInfo = Function.getLoopInfo();
BPI->findLoopEdgesInfo(LoopsInfo);
}
extractFeatures(Function, BC);
BPI->clear();
dumpFeatures(Printer, Function.getAddress());
dumpProfileData(Function, Printer2);
}
}
void FeatureMiner::dumpProfileData(BinaryFunction &Function,
raw_ostream &Printer) {
BinaryContext &BC = Function.getBinaryContext();
std::string FromFunName = Function.getPrintName();
for (auto &BB : Function) {
auto LastInst = BB.getLastNonPseudoInstr();
for (auto &Inst : BB) {
if (!BC.MIB->isCall(Inst) && !BC.MIB->isBranch(Inst) &&
LastInst != (&Inst))
continue;
auto Offset = BC.MIB->tryGetAnnotationAs<uint64_t>(Inst, "Offset");
if (!Offset)
continue;
uint64_t TakenFreqEdge = 0;
auto FromBb = Offset.get();
std::string ToFunName;
uint32_t ToBb;
if (BC.MIB->isCall(Inst)) {
auto *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
if (!CalleeSymbol)
continue;
ToFunName = CalleeSymbol->getName();
ToBb = 0;
if (BC.MIB->getConditionalTailCall(Inst)) {
if (BC.MIB->hasAnnotation(Inst, "CTCTakenCount")) {
auto CountAnnt =
BC.MIB->tryGetAnnotationAs<uint64_t>(Inst, "CTCTakenCount");
if (CountAnnt) {
TakenFreqEdge = (*CountAnnt);
}
}
} else {
if (BC.MIB->hasAnnotation(Inst, "Count")) {
auto CountAnnt =
BC.MIB->tryGetAnnotationAs<uint64_t>(Inst, "Count");
if (CountAnnt) {
TakenFreqEdge = (*CountAnnt);
}
}
}
if (TakenFreqEdge > 0)
Printer << "1 " << FromFunName << " " << Twine::utohexstr(FromBb)
<< " 1 " << ToFunName << " " << Twine::utohexstr(ToBb) << " "
<< 0 << " " << TakenFreqEdge << "\n";
} else {
for (BinaryBasicBlock *SuccBB : BB.successors()) {
TakenFreqEdge = BB.getBranchInfo(*SuccBB).Count;
BinaryFunction *ToFun = SuccBB->getFunction();
ToFunName = ToFun->getPrintName();
ToBb = SuccBB->getInputOffset();
if (TakenFreqEdge > 0)
Printer << "1 " << FromFunName << " " << Twine::utohexstr(FromBb)
<< " 1 " << ToFunName << " " << Twine::utohexstr(ToBb)
<< " " << 0 << " " << TakenFreqEdge << "\n";
}
}
}
}
}
} // namespace bolt
} // namespace llvm

View File

@ -1,165 +0,0 @@
//===--- Passes/FeatureMiner.h -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// A very simple feature extractor based on Calder's paper,
// "Evidence-based static branch prediction using machine learning":
// https://dl.acm.org/doi/10.1145/239912.239923
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_
#define LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "BinaryLoop.h"
#include "DominatorAnalysis.h"
#include "Passes/BinaryPasses.h"
#include "Passes/BranchPredictionInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
namespace llvm {
namespace bolt {
class FeatureMiner : public BinaryFunctionPass {
private:
std::unique_ptr<BranchPredictionInfo> BPI;
/// BasicBlockInfo - This structure holds feature information about the target
/// BasicBlock of either the taken or the fallthrough paths of a given branch.
struct BasicBlockInfo {
Optional<bool> BranchDominates; // 1 - dominates, 0 - does not dominate
Optional<bool> BranchPostdominates; // 1 - postdominates, 0 - does not PD
Optional<bool> LoopHeader; // 1 - loop header, 0 - not a loop header
Optional<bool> Backedge; // 1 - loop back, 0 - not a loop back
Optional<bool> Exit; // 1 - loop exit, 0 - not a loop exit
Optional<bool> Call; // 1 - program call, 0 - not a program call
Optional<unsigned> NumCalls;
Optional<unsigned> NumLoads;
Optional<unsigned> NumStores;
Optional<int32_t> EndOpcode; // 0 = NOTHING
StringRef EndOpcodeStr = "UNDEF";
Optional<int32_t> BasicBlockSize;
std::string FromFunName = "UNDEF";
uint32_t FromBb;
std::string ToFunName = "UNDEF";
uint32_t ToBb;
Optional<unsigned> NumCallsExit;
Optional<unsigned> NumCallsInvoke;
Optional<unsigned> NumIndirectCalls;
Optional<unsigned> NumTailCalls;
};
typedef std::unique_ptr<struct BasicBlockInfo> BBIPtr;
/// StaticBranchInfo - This structure holds feature information about each
/// two-way branch from the program.
struct StaticBranchInfo {
StringRef OpcodeStr = "UNDEF";
StringRef CmpOpcodeStr = "UNDEF";
bool Simple = 0;
Optional<int32_t> Opcode;
Optional<int32_t> CmpOpcode;
Optional<int64_t> Count;
Optional<int64_t> MissPredicted;
Optional<int64_t> FallthroughCount;
Optional<int64_t> FallthroughMissPredicted;
BBIPtr TrueSuccessor = std::make_unique<struct BasicBlockInfo>();
BBIPtr FalseSuccessor = std::make_unique<struct BasicBlockInfo>();
Optional<int8_t> ProcedureType; // 1 - Leaf, 0 - NonLeaf, 2 - CallSelf
Optional<bool> LoopHeader; // 1 - loop header, 0 - not a loop header
Optional<bool> Direction; // 1 - Forward Branch, 0 - Backward Branch
Optional<unsigned> NumOuterLoops;
Optional<unsigned> TotalLoops;
Optional<unsigned> MaximumLoopDepth;
Optional<unsigned> LoopDepth;
Optional<unsigned> LoopNumExitEdges;
Optional<unsigned> LoopNumExitBlocks;
Optional<unsigned> LoopNumExitingBlocks;
Optional<unsigned> LoopNumLatches;
Optional<unsigned> LoopNumBlocks;
Optional<unsigned> LoopNumBackEdges;
Optional<unsigned> NumLoads;
Optional<unsigned> NumStores;
Optional<bool> LocalExitingBlock;
Optional<bool> LocalLatchBlock;
Optional<bool> LocalLoopHeader;
Optional<bool> Call;
Optional<unsigned> NumCalls;
Optional<unsigned> NumCallsExit;
Optional<unsigned> NumCallsInvoke;
Optional<unsigned> NumIndirectCalls;
Optional<unsigned> NumTailCalls;
Optional<unsigned> NumSelfCalls;
Optional<unsigned> NumBasicBlocks;
Optional<unsigned> DeltaTaken;
Optional<int32_t> OperandRAType;
Optional<int32_t> OperandRBType;
Optional<int32_t> BasicBlockSize;
};
typedef std::unique_ptr<struct StaticBranchInfo> SBIPtr;
std::vector<SBIPtr> BranchesInfoSet;
/// getProcedureType - Determines which category the function falls into:
/// Leaf, Non-leaf or Calls-self.
int8_t getProcedureType(BinaryFunction &Function, BinaryContext &BC);
/// addSuccessorInfo - Discovers feature information for the target successor
/// basic block, and inserts it into the static branch info container.
void addSuccessorInfo(DominatorAnalysis<false> &DA,
DominatorAnalysis<true> &PDA, SBIPtr const &SBI,
BinaryFunction &Function, BinaryContext &BC,
MCInst &Inst, BinaryBasicBlock &BB, bool Succ);
/// extractFeatures - Extracts the feature information for each two-way branch
/// from the program.
void extractFeatures(BinaryFunction &Function, BinaryContext &BC);
/// dumpSuccessorFeatures - Dumps the feature information about the target
/// BasicBlock of either the taken or the fallthrough paths of a given branch.
void dumpSuccessorFeatures(raw_ostream &Printer, BBIPtr &Successor);
/// dumpFeatures - Dumps the feature information about each two-way branch
/// from the program.
void dumpFeatures(raw_ostream &Printer, uint64_t FunctionAddress);
/// dumpProfileData - Dumps a limited version of the input profile data
/// that contains only profile for conditional branches, unconditional
/// branches and terminators that aren't branches.
void dumpProfileData(BinaryFunction &Function, raw_ostream &Printer);
public:
explicit FeatureMiner(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}
const char *getName() const override { return "feature-miner"; }
void runOnFunctions(BinaryContext &BC) override;
};
} // namespace bolt
} // namespace llvm
#endif /* LLVM_TOOLS_LLVM_BOLT_PASSES_FEATUREMINER_H_ */
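To make the encoding above concrete, here is a minimal, self-contained sketch of filling and dumping one such feature record. The trimmed-down struct mirrors a few StaticBranchInfo fields; std::optional stands in for llvm::Optional, and printing missing features as -1 is an assumption for illustration, not necessarily the format dumpFeatures emits.

#include <cstdint>
#include <iostream>
#include <optional>

// Trimmed-down mirror of StaticBranchInfo above (illustration only).
struct BranchFeatures {
  std::optional<bool> Direction;   // 1 - forward branch, 0 - backward branch
  std::optional<bool> LoopHeader;  // 1 - loop header, 0 - not a loop header
  std::optional<unsigned> LoopDepth;
  std::optional<int64_t> Count;            // times the branch was taken
  std::optional<int64_t> FallthroughCount; // times it fell through
};

// Print one record; missing features are encoded as -1 (an assumption).
void dump(const BranchFeatures &F) {
  auto Val = [](const auto &Opt) -> int64_t {
    return Opt ? static_cast<int64_t>(*Opt) : -1;
  };
  std::cout << Val(F.Direction) << ',' << Val(F.LoopHeader) << ','
            << Val(F.LoopDepth) << ',' << Val(F.Count) << ','
            << Val(F.FallthroughCount) << '\n';
}

int main() {
  BranchFeatures F;
  F.Direction = true; // forward branch
  F.LoopHeader = false;
  F.LoopDepth = 1;
  F.Count = 90;
  F.FallthroughCount = 10;
  dump(F); // prints: 1,0,1,90,10
}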


@ -13,6 +13,7 @@
#include "ParallelUtilities.h"
#include "llvm/Support/ThreadPool.h"
#include <fstream>
#include <stack>
#define DEBUG_TYPE "fa"
@ -119,16 +120,16 @@ class FrameAccessAnalysis {
MCPhysReg Reg{0};
int64_t StackOffset{0};
bool IsIndexed{false};
if (!BC.MIB->isStackAccess(Inst, FIE.IsLoad, FIE.IsStore, FIE.IsStoreFromReg,
Reg, SrcImm, FIE.StackPtrReg, StackOffset, FIE.Size,
FIE.IsSimple, IsIndexed)) {
if (!BC.MIB->isStackAccess(
Inst, FIE.IsLoad, FIE.IsStore, FIE.IsStoreFromReg, Reg, SrcImm,
FIE.StackPtrReg, StackOffset, FIE.Size, FIE.IsSimple, IsIndexed)) {
return true;
}
if (IsIndexed || FIE.Size == 0) {
DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n");
DEBUG(dbgs() << "Blame insn: ");
DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
LLVM_DEBUG(Inst.dump());
return false;
}
@ -140,21 +141,24 @@ class FrameAccessAnalysis {
if (FIE.StackPtrReg == BC.MIB->getStackPointer() && SPOffset != SPT.EMPTY &&
SPOffset != SPT.SUPERPOSITION) {
DEBUG(dbgs() << "Adding access via SP while CFA reg is another one\n");
LLVM_DEBUG(
dbgs() << "Adding access via SP while CFA reg is another one\n");
FIE.StackOffset = SPOffset + StackOffset;
} else if (FIE.StackPtrReg == BC.MIB->getFramePointer() &&
FPOffset != SPT.EMPTY && FPOffset != SPT.SUPERPOSITION) {
DEBUG(dbgs() << "Adding access via FP while CFA reg is another one\n");
LLVM_DEBUG(
dbgs() << "Adding access via FP while CFA reg is another one\n");
FIE.StackOffset = FPOffset + StackOffset;
} else if (FIE.StackPtrReg ==
BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false)) {
*BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false)) {
FIE.StackOffset = CfaOffset + StackOffset;
} else {
DEBUG(dbgs() << "Found stack access with reg different than cfa reg.\n");
DEBUG(dbgs() << "\tCurrent CFA reg: " << CfaReg
<< "\n\tStack access reg: " << FIE.StackPtrReg << "\n");
DEBUG(dbgs() << "Blame insn: ");
DEBUG(Inst.dump());
LLVM_DEBUG(
dbgs() << "Found stack access with reg different than cfa reg.\n");
LLVM_DEBUG(dbgs() << "\tCurrent CFA reg: " << CfaReg
<< "\n\tStack access reg: " << FIE.StackPtrReg << "\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
LLVM_DEBUG(Inst.dump());
return false;
}
IsValidAccess = true;
@ -183,7 +187,7 @@ public:
switch (CFI->getOperation()) {
case MCCFIInstruction::OpDefCfa:
CfaOffset = CFI->getOffset();
// Fall-through
LLVM_FALLTHROUGH;
case MCCFIInstruction::OpDefCfaRegister:
CfaReg = CFI->getRegister();
break;
@ -214,9 +218,10 @@ public:
}
if (BC.MIB->escapesVariable(Inst, SPT.HasFramePointer)) {
DEBUG(dbgs() << "Leaked stack address, giving up on this function.\n");
DEBUG(dbgs() << "Blame insn: ");
DEBUG(Inst.dump());
LLVM_DEBUG(
dbgs() << "Leaked stack address, giving up on this function.\n");
LLVM_DEBUG(dbgs() << "Blame insn: ");
LLVM_DEBUG(Inst.dump());
return false;
}
@ -376,8 +381,8 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst,
addArgAccessesFor(Inst, ArgAccesses(/*AssumeEverything=*/true));
break;
}
DEBUG(dbgs() << "Added arg in stack access annotation "
<< CurOffset + Elem.first << "\n");
LLVM_DEBUG(dbgs() << "Added arg in stack access annotation "
<< CurOffset + Elem.first << "\n");
addArgInStackAccessFor(
Inst, ArgInStackAccess{/*StackOffset=*/CurOffset + Elem.first,
/*Size=*/Elem.second});
@ -387,7 +392,8 @@ bool FrameAnalysis::updateArgsTouchedFor(const BinaryFunction &BF, MCInst &Inst,
bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
if (!BF.isSimple() || !BF.hasCFG()) {
DEBUG(dbgs() << "Treating " << BF.getPrintName() << " conservatively.\n");
LLVM_DEBUG(dbgs() << "Treating " << BF.getPrintName()
<< " conservatively.\n");
ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0));
if (!FunctionsRequireAlignment.count(&BF)) {
FunctionsRequireAlignment.insert(&BF);
@ -396,8 +402,8 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
return false;
}
DEBUG(dbgs() << "Now computing args accessed for: " << BF.getPrintName()
<< "\n");
LLVM_DEBUG(dbgs() << "Now computing args accessed for: " << BF.getPrintName()
<< "\n");
bool UpdatedArgsTouched = false;
bool NoInfo = false;
FrameAccessAnalysis FAA(BC, BF, getSPT(BF));
@ -431,7 +437,7 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
// Record accesses to the previous stack frame
ArgsTouchedMap[&BF].emplace(std::make_pair(FIE.StackOffset, FIE.Size));
UpdatedArgsTouched = true;
DEBUG({
LLVM_DEBUG({
dbgs() << "Arg access offset " << FIE.StackOffset << " added to:\n";
BC.printInstruction(dbgs(), Inst, 0, &BF, true);
});
@ -461,16 +467,16 @@ bool FrameAnalysis::computeArgsAccessed(BinaryFunction &BF) {
bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
FrameAccessAnalysis FAA(BC, BF, getSPT(BF));
DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
<< "\"\n");
LLVM_DEBUG(dbgs() << "Restoring frame indices for \"" << BF.getPrintName()
<< "\"\n");
for (auto BB : BF.layout()) {
DEBUG(dbgs() << "\tNow at BB " << BB->getName() << "\n");
LLVM_DEBUG(dbgs() << "\tNow at BB " << BB->getName() << "\n");
FAA.enterNewBB();
for (auto &Inst : *BB) {
if (!FAA.doNext(*BB, Inst))
return false;
DEBUG({
LLVM_DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
dbgs() << "\t\t\tSP offset is " << FAA.getSPOffset() << "\n";
@ -482,7 +488,7 @@ bool FrameAnalysis::restoreFrameIndex(BinaryFunction &BF) {
const FrameIndexEntry &FIE = FAA.getFIE();
addFIEFor(Inst, FIE);
DEBUG({
LLVM_DEBUG({
dbgs() << "Frame index annotation " << FIE << " added to:\n";
BC.printInstruction(dbgs(), Inst, 0, &BF, true);
});


@ -60,15 +60,15 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
StackAvailableExpressions SAE(RA, FA, BC, BF);
SAE.run();
DEBUG(dbgs() << "Performing unnecessary loads removal\n");
LLVM_DEBUG(dbgs() << "Performing unnecessary loads removal\n");
std::deque<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
bool Changed = false;
const auto ExprEnd = SAE.expr_end();
for (auto &BB : BF) {
DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (auto &Inst : BB) {
DEBUG({
LLVM_DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
for (auto I = Prev ? SAE.expr_begin(*Prev) : SAE.expr_begin(BB);
@ -109,47 +109,47 @@ void FrameOptimizerPass::removeUnnecessaryLoads(const RegAnalysis &RA,
++NumRedundantLoads;
Changed = true;
DEBUG(dbgs() << "Redundant load instruction: ");
DEBUG(Inst.dump());
DEBUG(dbgs() << "Related store instruction: ");
DEBUG(AvailableInst->dump());
DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << "Redundant load instruction: ");
LLVM_DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "Related store instruction: ");
LLVM_DEBUG(AvailableInst->dump());
LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
// Replace load
if (FIEY->IsStoreFromReg) {
if (!BC.MIB->replaceMemOperandWithReg(Inst, FIEY->RegOrImm)) {
DEBUG(dbgs() << "FAILED to change operand to a reg\n");
LLVM_DEBUG(dbgs() << "FAILED to change operand to a reg\n");
break;
}
++NumLoadsChangedToReg;
BC.MIB->removeAnnotation(Inst, "FrameAccessEntry");
DEBUG(dbgs() << "Changed operand to a reg\n");
LLVM_DEBUG(dbgs() << "Changed operand to a reg\n");
if (BC.MIB->isRedundantMove(Inst)) {
++NumLoadsDeleted;
DEBUG(dbgs() << "Created a redundant move\n");
LLVM_DEBUG(dbgs() << "Created a redundant move\n");
// Delete it!
ToErase.push_front(std::make_pair(&BB, &Inst));
}
} else {
char Buf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
support::ulittle64_t::ref(Buf + 0) = FIEY->RegOrImm;
DEBUG(dbgs() << "Changing operand to an imm... ");
LLVM_DEBUG(dbgs() << "Changing operand to an imm... ");
if (!BC.MIB->replaceMemOperandWithImm(Inst, StringRef(Buf, 8), 0)) {
DEBUG(dbgs() << "FAILED\n");
LLVM_DEBUG(dbgs() << "FAILED\n");
} else {
++NumLoadsChangedToImm;
BC.MIB->removeAnnotation(Inst, "FrameAccessEntry");
DEBUG(dbgs() << "Ok\n");
LLVM_DEBUG(dbgs() << "Ok\n");
}
}
DEBUG(dbgs() << "Changed to: ");
DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "Changed to: ");
LLVM_DEBUG(Inst.dump());
break;
}
Prev = &Inst;
}
}
if (Changed) {
DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
// TODO: Implement an interface of eraseInstruction that works out the
// complete list of elements to remove.
@ -164,15 +164,15 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
StackReachingUses SRU(FA, BC, BF);
SRU.run();
DEBUG(dbgs() << "Performing unused stores removal\n");
LLVM_DEBUG(dbgs() << "Performing unused stores removal\n");
std::vector<std::pair<BinaryBasicBlock *, MCInst *>> ToErase;
bool Changed = false;
for (auto &BB : BF) {
DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() <<"\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (auto I = BB.rbegin(), E = BB.rend(); I != E; ++I) {
auto &Inst = *I;
DEBUG({
LLVM_DEBUG({
dbgs() << "\t\tNow at ";
Inst.dump();
for (auto I = Prev ? SRU.expr_begin(*Prev) : SRU.expr_begin(BB);
@ -202,10 +202,10 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
++NumRedundantStores;
Changed = true;
DEBUG(dbgs() << "Unused store instruction: ");
DEBUG(Inst.dump());
DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
LLVM_DEBUG(dbgs() << "Unused store instruction: ");
LLVM_DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "@BB: " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << "FIE offset = " << FIEX->StackOffset
<< " size = " << (int)FIEX->Size << "\n");
// Delete it!
ToErase.push_back(std::make_pair(&BB, &Inst));
@ -217,7 +217,7 @@ void FrameOptimizerPass::removeUnusedStores(const FrameAnalysis &FA,
I.first->eraseInstruction(I.first->findInstruction(I.second));
}
if (Changed) {
DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
LLVM_DEBUG(dbgs() << "FOP modified \"" << BF.getPrintName() << "\"\n");
}
}
@ -256,11 +256,12 @@ void FrameOptimizerPass::runOnFunctions(BinaryContext &BC) {
if (opts::FrameOptimization == FOP_HOT) {
if (I.second.getKnownExecutionCount() < BC.getHotThreshold())
continue;
DEBUG(dbgs() << "Considering " << I.second.getPrintName()
<< " for frame optimizations because its execution count ( "
<< I.second.getKnownExecutionCount()
<< " ) exceeds our hotness threshold ( "
<< BC.getHotThreshold() << " )\n");
LLVM_DEBUG(
dbgs() << "Considering " << I.second.getPrintName()
<< " for frame optimizations because its execution count ( "
<< I.second.getKnownExecutionCount()
<< " ) exceeds our hotness threshold ( "
<< BC.getHotThreshold() << " )\n");
}
{


@ -28,9 +28,9 @@
*/
#include "HFSort.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/raw_ostream.h"
#include <unordered_set>
@ -109,7 +109,7 @@ void freezeClusters(const CallGraph &Cg, std::vector<Cluster> &Clusters) {
if (NewSize > FrozenPages * HugePageSize) break;
C.freeze();
TotalSize = NewSize;
DEBUG(
LLVM_DEBUG(
auto Fid = C.target(0);
dbgs() <<
format("freezing cluster for func %d, size = %u, samples = %lu)\n",
@ -229,7 +229,7 @@ std::vector<Cluster> clusterize(const CallGraph &Cg) {
continue;
}
DEBUG(
LLVM_DEBUG(
if (opts::Verbosity > 1) {
dbgs() << format("merging %s -> %s: %u\n",
PredCluster->toString().c_str(),


@ -25,7 +25,7 @@
#include "BinaryFunction.h"
#include "HFSort.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <set>
#include <vector>


@ -11,7 +11,7 @@
#include "Passes/IdenticalCodeFolding.h"
#include "ParallelUtilities.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Timer.h"
#include <atomic>
@ -461,7 +461,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
"ICF breakdown", "ICF breakdown",
opts::TimeICF);
Timer SinglePass("single fold pass", "single fold pass");
DEBUG(SinglePass.startTimer());
LLVM_DEBUG(SinglePass.startTimer());
ThreadPool *ThPool;
if (!opts::NoThreads)
@ -470,7 +470,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
// Fold identical functions within a single congruent bucket
auto processSingleBucket = [&](std::set<BinaryFunction *> &Candidates) {
Timer T("folding single congruent list", "folding single congruent list");
DEBUG(T.startTimer());
LLVM_DEBUG(T.startTimer());
// Identical functions go into the same bucket.
IdenticalBucketsMap IdenticalBuckets;
@ -495,8 +495,8 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
BinaryFunction *ParentBF = Twins[0];
for (unsigned i = 1; i < Twins.size(); ++i) {
auto *ChildBF = Twins[i];
DEBUG(dbgs() << "BOLT-DEBUG: folding " << *ChildBF << " into "
<< *ParentBF << '\n');
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: folding " << *ChildBF << " into "
<< *ParentBF << '\n');
// Remove child function from the list of candidates.
auto FI = Candidates.find(ChildBF);
@ -517,7 +517,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
}
}
DEBUG(T.stopTimer());
LLVM_DEBUG(T.stopTimer());
};
// Create a task for each congruent bucket
@ -535,7 +535,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
if (!opts::NoThreads)
ThPool->wait();
DEBUG(SinglePass.stopTimer());
LLVM_DEBUG(SinglePass.stopTimer());
};
hashFunctions();
@ -545,7 +545,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
// We repeat the pass until no new modifications happen.
do {
NumFoldedLastIteration = 0;
DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ICF iteration " << Iteration << "...\n");
performFoldingPass();
@ -554,7 +554,7 @@ void IdenticalCodeFolding::runOnFunctions(BinaryContext &BC) {
} while (NumFoldedLastIteration > 0);
DEBUG(
LLVM_DEBUG(
// Print functions that are congruent but not identical.
for (auto &CBI : CongruentBuckets) {
auto &Candidates = CBI.second;


@ -11,7 +11,7 @@
#include "IndirectCallPromotion.h"
#include "DataflowInfoManager.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <numeric>
#define DEBUG_TYPE "ICP"
@ -316,7 +316,7 @@ IndirectCallPromotion::getCallTargets(
}
++Result;
DEBUG(if (Targets.end() - Result > 0) {
LLVM_DEBUG(if (Targets.end() - Result > 0) {
dbgs() << "BOLT-INFO: ICP: " << (Targets.end() - Result)
<< " duplicate targets removed\n";
});
@ -360,7 +360,7 @@ IndirectCallPromotion::getCallTargets(
});
Targets.erase(Last, Targets.end());
DEBUG(
LLVM_DEBUG(
if (BF.getJumpTable(Inst)) {
uint64_t TotalCount = 0;
uint64_t TotalMispreds = 0;
@ -418,7 +418,7 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(
if (!MemLocInstr)
return JumpTableInfoType();
DEBUG({
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP attempting to find memory profiling data for "
<< "jump table in " << Function << " at @ "
<< (&CallInst - &BB->front()) << "\n"
@ -453,7 +453,8 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(
uint64_t ArrayStart;
if (DispExpr) {
auto DispValueOrError = BC.getSymbolValue(DispExpr->getSymbol());
auto DispValueOrError =
BC.getSymbolValue(*BC.MIB->getTargetSymbol(DispExpr));
assert(DispValueOrError && "global symbol needs a value");
ArrayStart = *DispValueOrError;
} else {
@ -493,15 +494,15 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(
// If Index is out of range it probably means the memory profiling data is
// wrong for this instruction, bail out.
if (Index >= Range.second) {
DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
<< ", " << Range.second << "\n");
LLVM_DEBUG(dbgs() << "BOLT-INFO: Index out of range of " << Range.first
<< ", " << Range.second << "\n");
return JumpTableInfoType();
}
// Make sure the hot index points at a legal label corresponding to a BB,
// e.g. not the end of function (unreachable) label.
if (!Function.getBasicBlockForLabel(JT->Entries[Index + Range.first])) {
DEBUG({
LLVM_DEBUG({
dbgs() << "BOLT-INFO: hot index " << Index << " pointing at bogus "
<< "label " << JT->Entries[Index + Range.first]->getName()
<< " in jump table:\n";
@ -532,7 +533,7 @@ IndirectCallPromotion::maybeGetHotJumpTableTargets(
// Sort with highest counts first.
std::sort(HotTargets.rbegin(), HotTargets.rend());
DEBUG({
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP jump table hot targets:\n";
for (const auto &Target : HotTargets) {
dbgs() << "BOLT-INFO: Idx = " << Target.second << ", "
@ -575,8 +576,9 @@ IndirectCallPromotion::findCallTargetSymbols(
if (std::find(JTIs.begin(), JTIs.end(), JTIndex) != JTIs.end())
return I;
}
DEBUG(dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
<< " table entry in " << Function << "\n");
LLVM_DEBUG(
dbgs() << "BOLT-ERROR: Unable to find target index for hot jump "
<< " table entry in " << Function << "\n");
llvm_unreachable("Hot indices must be referred to by at least one "
"callsite");
};
@ -1179,7 +1181,7 @@ IndirectCallPromotion::printCallsiteInfo(const BinaryBasicBlock *BB,
outs() << "\n";
}
DEBUG({
LLVM_DEBUG({
dbgs() << "BOLT-INFO: ICP original call instruction:";
BC.printInstruction(dbgs(), Inst, Targets[0].From.Addr, nullptr, true);
});
@ -1397,7 +1399,7 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
if (!N)
continue;
DEBUG(printDecision(dbgs(), Targets, N));
LLVM_DEBUG(printDecision(dbgs(), Targets, N));
// If we can't resolve any of the target symbols, punt on this callsite.
// TODO: can this ever happen?
@ -1422,9 +1424,10 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
Inst,
SymTargets);
TotalMethodLoadsEliminated += MethodInfo.first.empty() ? 0 : 1;
DEBUG(dbgs() << "BOLT-INFO: ICP "
<< (!MethodInfo.first.empty() ? "found" : "did not find")
<< " vtables for all methods.\n");
LLVM_DEBUG(dbgs()
<< "BOLT-INFO: ICP "
<< (!MethodInfo.first.empty() ? "found" : "did not find")
<< " vtables for all methods.\n");
} else if (TargetFetchInst) {
++TotalIndexBasedJumps;
MethodInfo.second.push_back(TargetFetchInst);
@ -1451,7 +1454,7 @@ void IndirectCallPromotion::runOnFunctions(BinaryContext &BC) {
continue;
}
DEBUG({
LLVM_DEBUG({
auto Offset = Targets[0].From.Addr;
dbgs() << "BOLT-INFO: ICP indirect call code:\n";
for (const auto &entry : ICPcode) {


@ -25,7 +25,7 @@
#include "Inliner.h"
#include "MCPlus.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <map>
#define DEBUG_TYPE "bolt-inliner"
@ -152,7 +152,7 @@ uint64_t Inliner::getSizeOfCallInst(const BinaryContext &BC) {
return SizeOfCallInst;
MCInst Inst;
BC.MIB->createCall(Inst, BC.Ctx->createTempSymbol(), BC.Ctx.get());
BC.MIB->createCall(Inst, BC.Ctx->createNamedTempSymbol(), BC.Ctx.get());
SizeOfCallInst = BC.computeInstructionSize(Inst);
return SizeOfCallInst;
@ -163,7 +163,7 @@ uint64_t Inliner::getSizeOfTailCallInst(const BinaryContext &BC) {
return SizeOfTailCallInst;
MCInst Inst;
BC.MIB->createTailCall(Inst, BC.Ctx->createTempSymbol(), BC.Ctx.get());
BC.MIB->createTailCall(Inst, BC.Ctx->createNamedTempSymbol(), BC.Ctx.get());
SizeOfTailCallInst = BC.computeInstructionSize(Inst);
return SizeOfTailCallInst;
@ -341,7 +341,7 @@ Inliner::inlineCall(BinaryBasicBlock &CallerBB,
if (MIB.isPseudo(Inst))
continue;
MIB.stripAnnotations(Inst);
MIB.stripAnnotations(Inst, /*KeepTC=*/BC.isX86());
// Fix branch target. Strictly speaking, we don't have to do this as
// targets of direct branches will be fixed later and don't matter
@ -499,10 +499,11 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
}
}
DEBUG(dbgs() << "BOLT-DEBUG: inlining call to " << *TargetFunction
<< " in " << Function << " : " << BB->getName()
<< ". Count: " << BB->getKnownExecutionCount()
<< ". Size change: " << SizeAfterInlining << " bytes.\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: inlining call to " << *TargetFunction
<< " in " << Function << " : " << BB->getName()
<< ". Count: " << BB->getKnownExecutionCount()
<< ". Size change: " << SizeAfterInlining
<< " bytes.\n");
std::tie(BB, InstIt) = inlineCall(*BB, InstIt, *TargetFunction);
@ -522,8 +523,8 @@ bool Inliner::inlineCallsInFunction(BinaryFunction &Function) {
auto CallerIInfo = InliningCandidates.find(&Function);
if (CallerIInfo != InliningCandidates.end() &&
CallerIInfo->second.Type == INL_ANY) {
DEBUG(dbgs() << "adjusting inlining status for function " << Function
<< '\n');
LLVM_DEBUG(dbgs() << "adjusting inlining status for function "
<< Function << '\n');
CallerIInfo->second.Type = INL_TAILCALL;
}
}


@ -12,7 +12,8 @@
#include "Instrumentation.h"
#include "ParallelUtilities.h"
#include "Passes/DataflowInfoManager.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <stack>
#define DEBUG_TYPE "bolt-instrumentation"
@ -72,7 +73,7 @@ uint32_t Instrumentation::getFunctionNameIndex(const BinaryFunction &Function) {
return Iter->second;
auto Idx = Summary->StringTable.size();
FuncToStringIdx.emplace(std::make_pair(&Function, Idx));
Summary->StringTable.append(Function.getOneName());
Summary->StringTable.append(std::string(Function.getOneName()));
Summary->StringTable.append(1, '\0');
return Idx;
}
@ -156,7 +157,7 @@ std::vector<MCInst>
Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) {
auto L = BC.scopeLock();
MCSymbol *Label;
Label = BC.Ctx->createTempSymbol("InstrEntry", true);
Label = BC.Ctx->createNamedTempSymbol("InstrEntry");
Summary->Counters.emplace_back(Label);
std::vector<MCInst> CounterInstrs(5);
// Don't clobber application red zone (ABI dependent)
@ -586,7 +587,7 @@ void Instrumentation::runOnFunctions(BinaryContext &BC) {
void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
auto createSimpleFunction =
[&](StringRef Title, std::vector<MCInst> Instrs) -> BinaryFunction * {
BinaryFunction *Func = BC.createInjectedBinaryFunction(Title);
BinaryFunction *Func = BC.createInjectedBinaryFunction(std::string(Title));
std::vector<std::unique_ptr<BinaryBasicBlock>> BBs;
BBs.emplace_back(
@ -649,7 +650,7 @@ void Instrumentation::setupRuntimeLibrary(BinaryContext &BC) {
<< opts::InstrumentationFilename << "\n";
BC.setRuntimeLibrary(
llvm::make_unique<InstrumentationRuntimeLibrary>(std::move(Summary)));
std::make_unique<InstrumentationRuntimeLibrary>(std::move(Summary)));
}
}
}


@ -30,7 +30,7 @@ class Instrumentation : public BinaryFunctionPass {
public:
Instrumentation(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass),
Summary(llvm::make_unique<InstrumentationSummary>()) {}
Summary(std::make_unique<InstrumentationSummary>()) {}
/// Modifies all functions by inserting instrumentation code (first step)
void runOnFunctions(BinaryContext &BC) override;


@ -15,6 +15,7 @@
#define LLVM_TOOLS_LLVM_BOLT_PASSES_INSTRUMENTATION_SUMMARY_H
#include "llvm/ADT/DenseSet.h"
#include <vector>
namespace llvm {


@ -10,7 +10,7 @@
//===----------------------------------------------------------------------===//
#include "JTFootprintReduction.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#define DEBUG_TYPE "JT"


@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "LongJmp.h"
#include "llvm/Support/Alignment.h"
#define DEBUG_TYPE "longjmp"
@ -79,7 +80,7 @@ LongJmpPass::createNewStub(BinaryBasicBlock &SourceBB, const MCSymbol *TgtSym,
BinaryFunction &Func = *SourceBB.getFunction();
const BinaryContext &BC = Func.getBinaryContext();
const bool IsCold = SourceBB.isCold();
auto *StubSym = BC.Ctx->createTempSymbol("Stub", true);
auto *StubSym = BC.Ctx->createNamedTempSymbol("Stub");
auto StubBB = Func.createBasicBlock(0, StubSym);
MCInst Inst;
BC.MIB->createUncondBranch(Inst, TgtSym, BC.Ctx.get());
@ -147,7 +148,7 @@ BinaryBasicBlock *LongJmpPass::lookupStubFromGroup(
uint64_t PCRelTgtAddress = Cand->first;
PCRelTgtAddress = DotAddress > PCRelTgtAddress ? DotAddress - PCRelTgtAddress
: PCRelTgtAddress - DotAddress;
DEBUG({
LLVM_DEBUG({
if (Candidates.size() > 1)
dbgs() << "Considering stub group with " << Candidates.size()
<< " candidates. DotAddress is " << Twine::utohexstr(DotAddress)
@ -301,12 +302,12 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart(
if (!Func->isSplit())
continue;
DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign);
auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions);
auto Pad = offsetToAlignment(DotAddress, llvm::Align(opts::AlignFunctions));
if (Pad <= opts::AlignFunctionsMaxBytes)
DotAddress += Pad;
ColdAddresses[Func] = DotAddress;
DEBUG(dbgs() << Func->getPrintName() << " cold tentative: "
<< Twine::utohexstr(DotAddress) << "\n");
LLVM_DEBUG(dbgs() << Func->getPrintName() << " cold tentative: "
<< Twine::utohexstr(DotAddress) << "\n");
DotAddress += Func->estimateColdSize();
DotAddress += Func->estimateConstantIslandSize();
}
@ -349,12 +350,12 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode(
}
DotAddress = alignTo(DotAddress, BinaryFunction::MinAlign);
auto Pad = OffsetToAlignment(DotAddress, opts::AlignFunctions);
auto Pad = offsetToAlignment(DotAddress, llvm::Align(opts::AlignFunctions));
if (Pad <= opts::AlignFunctionsMaxBytes)
DotAddress += Pad;
HotAddresses[Func] = DotAddress;
DEBUG(dbgs() << Func->getPrintName()
<< " tentative: " << Twine::utohexstr(DotAddress) << "\n");
LLVM_DEBUG(dbgs() << Func->getPrintName() << " tentative: "
<< Twine::utohexstr(DotAddress) << "\n");
if (!Func->isSplit())
DotAddress += Func->estimateSize();
else
@ -393,7 +394,7 @@ void LongJmpPass::tentativeLayout(
// Initial padding
if (opts::UseOldText && EstimatedTextSize <= BC.OldTextSectionSize) {
DotAddress = BC.OldTextSectionAddress;
auto Pad = OffsetToAlignment(DotAddress, BC.PageAlign);
auto Pad = offsetToAlignment(DotAddress, llvm::Align(BC.PageAlign));
if (Pad + EstimatedTextSize <= BC.OldTextSectionSize) {
DotAddress += Pad;
}
@ -464,9 +465,10 @@ bool LongJmpPass::relaxStub(BinaryBasicBlock &StubBB) {
if (Bits >= RangeShortJmp)
return false;
DEBUG(dbgs() << "Relaxing stub to short jump. PCRelTgtAddress = "
<< Twine::utohexstr(PCRelTgtAddress)
<< " RealTargetSym = " << RealTargetSym->getName() << "\n");
LLVM_DEBUG(dbgs() << "Relaxing stub to short jump. PCRelTgtAddress = "
<< Twine::utohexstr(PCRelTgtAddress)
<< " RealTargetSym = " << RealTargetSym->getName()
<< "\n");
relaxStubToShortJmp(StubBB, RealTargetSym);
StubBits[&StubBB] = RangeShortJmp;
return true;
@ -476,9 +478,9 @@ bool LongJmpPass::relaxStub(BinaryBasicBlock &StubBB) {
if (Bits > RangeShortJmp)
return false;
DEBUG(dbgs() << "Relaxing stub to long jump. PCRelTgtAddress = "
<< Twine::utohexstr(PCRelTgtAddress)
<< " RealTargetSym = " << RealTargetSym->getName() << "\n");
LLVM_DEBUG(dbgs() << "Relaxing stub to long jump. PCRelTgtAddress = "
<< Twine::utohexstr(PCRelTgtAddress)
<< " RealTargetSym = " << RealTargetSym->getName() << "\n");
relaxStubToLongJmp(StubBB, RealTargetSym);
StubBits[&StubBB] = static_cast<int>(BC.AsmInfo->getCodePointerSize() * 8);
return true;

bolt/src/Passes/MCF.cpp (new file, 491 lines)

@ -0,0 +1,491 @@
//===--- Passes/MCF.cpp ---------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "MCF.h"
#include "BinaryFunction.h"
#include "BinaryPassManager.h"
#include "Passes/DataflowInfoManager.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Timer.h"
#include <vector>
#include <limits>
#include <algorithm>
#include <cmath>
#undef DEBUG_TYPE
#define DEBUG_TYPE "mcf"
using namespace llvm;
using namespace bolt;
namespace opts {
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> TimeOpts;
static cl::opt<bool>
IterativeGuess("iterative-guess",
cl::desc("in non-LBR mode, guess edge counts using iterative technique"),
cl::ZeroOrMore,
cl::init(false),
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<bool>
EqualizeBBCounts("equalize-bb-counts",
cl::desc("in non-LBR mode, use same count for BBs "
"that should have equivalent count"),
cl::ZeroOrMore,
cl::init(false),
cl::Hidden,
cl::cat(BoltOptCategory));
static cl::opt<bool>
UseRArcs("mcf-use-rarcs",
cl::desc("in MCF, consider the possibility of cancelling flow to balance "
"edges"),
cl::ZeroOrMore,
cl::init(false),
cl::Hidden,
cl::cat(BoltOptCategory));
} // namespace opts
namespace llvm {
namespace bolt {
namespace {
// Edge Weight Inference Heuristic
//
// We start by maintaining the invariant used in LBR mode, where the sum of
// pred edge counts is equal to the block execution count. This loop sets
// each pred edge's count by distributing the block's own execution count
// among its pred edges. The weight of each edge is guessed by looking at how
// hot each pred block is (in terms of samples).
// There are two caveats in this approach. One is for critical edges and the
// other is for self-referencing blocks (loops of 1 BB). For critical edges,
// we can't infer their hotness based solely on the pred BBs' execution
// counts. For each critical edge we look at the pred BB, then at its succs,
// to adjust the edge's weight.
//
//   [ 60 ]    [ 25 ]
//      |   \     |
//   [ 10 ]    [ 75 ]
//
// The illustration above shows a critical edge (the \ edge). We wish to
// adjust the BB count 60 down to 50 to properly determine the weight of the
// critical edge as 50 / 75.
// For self-referencing edges, we derive the weight by subtracting the sum of
// the predecessors' counts from the current BB execution count, if this
// result is non-negative.
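//
// Worked numbers for the illustration above (following the arithmetic in
// computeEdgeWeights below): the edge from the 60-block to the 75-block is
// critical, so its contribution is scaled by
// CritWeight = 60 / (60 + 25) ~ 0.71, giving an effective child count of
// 75 * 0.71 ~ 52.9, close to the ideal 50. The other child contributes 10,
// so the final weights out of the 60-block come out to roughly
// 52.9 / 62.9 ~ 0.84 (to the 75-block) and 10 / 62.9 ~ 0.16 (to the
// 10-block).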
using EdgeWeightMap =
DenseMap<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>,
double>;
template <class NodeT>
void updateEdgeWeight(EdgeWeightMap &EdgeWeights, const BinaryBasicBlock *A,
const BinaryBasicBlock *B, double Weight);
template <>
void updateEdgeWeight<BinaryBasicBlock *>(
EdgeWeightMap &EdgeWeights, const BinaryBasicBlock *A,
const BinaryBasicBlock *B, double Weight) {
EdgeWeights[std::make_pair(A, B)] = Weight;
return;
}
template <>
void updateEdgeWeight<Inverse<BinaryBasicBlock *>>(
EdgeWeightMap &EdgeWeights, const BinaryBasicBlock *A,
const BinaryBasicBlock *B, double Weight) {
EdgeWeights[std::make_pair(B, A)] = Weight;
return;
}
template <class NodeT>
void computeEdgeWeights(BinaryBasicBlock *BB, EdgeWeightMap &EdgeWeights) {
typedef GraphTraits<NodeT> GraphT;
typedef GraphTraits<Inverse<NodeT> > InvTraits;
double TotalChildrenCount{0.0};
SmallVector<double, 4> ChildrenExecCount;
// First pass computes total children execution count that directly
// contribute to this BB.
for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
E = GraphT::child_end(BB); CI != E; ++CI) {
typename GraphT::NodeRef Child = *CI;
double ChildExecCount = Child->getExecutionCount();
// Is self-reference?
if (Child == BB) {
ChildExecCount = 0.0; // will fill this in second pass
} else if (GraphT::child_end(BB) - GraphT::child_begin(BB) > 1 &&
InvTraits::child_end(Child) - InvTraits::child_begin(Child) > 1) {
// Handle critical edges. This will cause a skew towards crit edges, but
// it is a quick solution.
double CritWeight = 0.0;
uint64_t Denominator = 0;
for (typename InvTraits::ChildIteratorType
II = InvTraits::child_begin(Child),
IE = InvTraits::child_end(Child);
II != IE; ++II) {
typename GraphT::NodeRef N = *II;
Denominator += N->getExecutionCount();
if (N != BB) {
continue;
}
CritWeight = N->getExecutionCount();
}
if (Denominator)
CritWeight /= static_cast<double>(Denominator);
ChildExecCount *= CritWeight;
}
ChildrenExecCount.push_back(ChildExecCount);
TotalChildrenCount += ChildExecCount;
}
// Second pass fixes the weight of a possible self-reference edge
uint32_t ChildIndex{0};
for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
E = GraphT::child_end(BB); CI != E; ++CI) {
typename GraphT::NodeRef Child = *CI;
if (Child != BB) {
++ChildIndex;
continue;
}
if (static_cast<double>(BB->getExecutionCount()) > TotalChildrenCount) {
ChildrenExecCount[ChildIndex] =
BB->getExecutionCount() - TotalChildrenCount;
TotalChildrenCount += ChildrenExecCount[ChildIndex];
}
break;
}
// Third pass finally assigns weights to edges
ChildIndex = 0;
for (typename GraphT::ChildIteratorType CI = GraphT::child_begin(BB),
E = GraphT::child_end(BB); CI != E; ++CI) {
typename GraphT::NodeRef Child = *CI;
// Use 1.0 here: integer division (1 / N) would truncate the fallback to 0.
double Weight = 1.0 / (GraphT::child_end(BB) - GraphT::child_begin(BB));
if (TotalChildrenCount != 0.0)
Weight = ChildrenExecCount[ChildIndex] / TotalChildrenCount;
updateEdgeWeight<NodeT>(EdgeWeights, BB, Child, Weight);
++ChildIndex;
}
}
template<class NodeT>
void computeEdgeWeights(BinaryFunction &BF, EdgeWeightMap &EdgeWeights) {
for (auto &BB : BF) {
computeEdgeWeights<NodeT>(&BB, EdgeWeights);
}
}
/// Make BB count match the sum of all incoming edges. If AllEdges is true,
/// make it match max(SumPredEdges, SumSuccEdges).
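/// For example, a BB with count 10 whose incoming edges carry 8 and 5 is
/// raised to 13; with AllEdges set, the outgoing sum is checked the same way,
/// so the block ends up with the larger of the two sums.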
void recalculateBBCounts(BinaryFunction &BF, bool AllEdges) {
for (auto &BB : BF) {
uint64_t TotalPredsEWeight{0};
for (auto Pred : BB.predecessors()) {
TotalPredsEWeight += Pred->getBranchInfo(BB).Count;
}
if (TotalPredsEWeight > BB.getExecutionCount()) {
BB.setExecutionCount(TotalPredsEWeight);
}
if (!AllEdges)
continue;
uint64_t TotalSuccsEWeight{0};
for (auto &BI : BB.branch_info()) {
TotalSuccsEWeight += BI.Count;
}
if (TotalSuccsEWeight > BB.getExecutionCount()) {
BB.setExecutionCount(TotalSuccsEWeight);
}
}
}
// This is our main edge count guessing heuristic. Look at predecessors and
// assign a proportionally higher count to pred edges coming from blocks with
// a higher execution count in comparison with the other predecessor blocks,
// making SumPredEdges match the current BB count.
// If "UseSucc" is true, apply the same logic to successor edges as well. Since
// some successor edges may already have an assigned count, only update an
// edge if the new count is higher.
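// For example, if a BB with count 100 has predecessors P1 (count 300) and
// P2 (count 100), their relative weights are 0.75 and 0.25, so the two pred
// edges are raised to at least 75 and 25 respectively.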
void guessEdgeByRelHotness(BinaryFunction &BF, bool UseSucc,
EdgeWeightMap &PredEdgeWeights,
EdgeWeightMap &SuccEdgeWeights) {
for (auto &BB : BF) {
for (auto Pred : BB.predecessors()) {
double RelativeExec = PredEdgeWeights[std::make_pair(Pred, &BB)];
RelativeExec *= BB.getExecutionCount();
auto &BI = Pred->getBranchInfo(BB);
if (static_cast<uint64_t>(RelativeExec) > BI.Count)
BI.Count = static_cast<uint64_t>(RelativeExec);
}
if (!UseSucc)
continue;
auto BI = BB.branch_info_begin();
for (auto Succ : BB.successors()) {
double RelativeExec = SuccEdgeWeights[std::make_pair(&BB, Succ)];
RelativeExec *= BB.getExecutionCount();
if (static_cast<uint64_t>(RelativeExec) > BI->Count)
BI->Count = static_cast<uint64_t>(RelativeExec);
++BI;
}
}
}
using ArcSet =
DenseSet<std::pair<const BinaryBasicBlock *, const BinaryBasicBlock *>>;
/// Predecessor edges version of guessEdgeByIterativeApproach. GuessedArcs
/// holds all edges whose count has already been established. Try to guess the
/// count of the remaining edge, if there is only one left to guess, and
/// return true on success.
bool guessPredEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) {
if (BB->pred_size() == 0)
return false;
uint64_t TotalPredCount{0};
unsigned NumGuessedEdges{0};
for (auto Pred : BB->predecessors()) {
if (GuessedArcs.count(std::make_pair(Pred, BB)))
++NumGuessedEdges;
TotalPredCount += Pred->getBranchInfo(*BB).Count;
}
if (NumGuessedEdges != BB->pred_size() - 1)
return false;
int64_t Guessed =
static_cast<int64_t>(BB->getExecutionCount()) - TotalPredCount;
if (Guessed < 0)
Guessed = 0;
for (auto Pred : BB->predecessors()) {
if (GuessedArcs.count(std::make_pair(Pred, BB)))
continue;
Pred->getBranchInfo(*BB).Count = Guessed;
// Record the arc as known; otherwise the fixed-point loop in
// guessEdgeByIterativeApproach would keep re-guessing this edge (the succ
// version below already does this).
GuessedArcs.insert(std::make_pair(Pred, BB));
return true;
}
llvm_unreachable("Expected unguessed arc");
}
/// Successor edges version of guessEdgeByIterativeApproach. GuessedArcs
/// holds all edges whose count has already been established. Try to guess the
/// count of the remaining edge, if there is only one left to guess, and
/// return true on success.
bool guessSuccEdgeCounts(BinaryBasicBlock *BB, ArcSet &GuessedArcs) {
if (BB->succ_size() == 0)
return false;
uint64_t TotalSuccCount{0};
unsigned NumGuessedEdges{0};
auto BI = BB->branch_info_begin();
for (auto Succ : BB->successors()) {
if (GuessedArcs.count(std::make_pair(BB, Succ)))
++NumGuessedEdges;
TotalSuccCount += BI->Count;
++BI;
}
if (NumGuessedEdges != BB->succ_size() - 1)
return false;
int64_t Guessed =
static_cast<int64_t>(BB->getExecutionCount()) - TotalSuccCount;
if (Guessed < 0)
Guessed = 0;
BI = BB->branch_info_begin();
for (auto Succ : BB->successors()) {
if (GuessedArcs.count(std::make_pair(BB, Succ))) {
++BI;
continue;
}
BI->Count = Guessed;
GuessedArcs.insert(std::make_pair(BB, Succ));
return true;
}
llvm_unreachable("Expected unguessed arc");
}
/// Guess an edge count whenever we have only one edge (pred or succ) left
/// to guess. Then make its count equal to the BB count minus the sum of all
/// the other edge counts we already know. Repeat this until there is no
/// change.
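/// For example, if a BB with count 100 has two pred edges and one of them is
/// already known to carry 30, the remaining edge is set to 100 - 30 = 70.
/// Each such assignment can unlock further guesses elsewhere, so the process
/// repeats until a fixed point is reached.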
void guessEdgeByIterativeApproach(BinaryFunction &BF) {
ArcSet KnownArcs;
bool Changed{false};
do {
Changed = false;
for (auto &BB : BF) {
if (guessPredEdgeCounts(&BB, KnownArcs)) Changed = true;
if (guessSuccEdgeCounts(&BB, KnownArcs)) Changed = true;
}
} while (Changed);
// Guess count for non-inferred edges
for (auto &BB : BF) {
for (auto Pred : BB.predecessors()) {
if (KnownArcs.count(std::make_pair(Pred, &BB)))
continue;
auto &BI = Pred->getBranchInfo(BB);
BI.Count =
std::min(Pred->getExecutionCount(), BB.getExecutionCount()) / 2;
KnownArcs.insert(std::make_pair(Pred, &BB));
}
auto BI = BB.branch_info_begin();
for (auto Succ : BB.successors()) {
if (KnownArcs.count(std::make_pair(&BB, Succ))) {
++BI;
continue;
}
BI->Count =
std::min(BB.getExecutionCount(), Succ->getExecutionCount()) / 2;
KnownArcs.insert(std::make_pair(&BB, Succ));
break;
}
}
}
/// Associate each basic block with the BinaryLoop object corresponding to the
/// innermost loop containing this block.
DenseMap<const BinaryBasicBlock *, const BinaryLoop*>
createLoopNestLevelMap(BinaryFunction &BF) {
DenseMap<const BinaryBasicBlock *, const BinaryLoop*> LoopNestLevel;
auto &BLI = BF.getLoopInfo();
for (auto &BB : BF) {
LoopNestLevel[&BB] = BLI[&BB];
}
return LoopNestLevel;
}
/// Implement the idea in "SamplePGO - The Power of Profile Guided Optimizations
/// without the Usability Burden" by Diego Novillo to make basic block counts
/// equal if we show that A dominates B, B post-dominates A and they are in the
/// same loop and same loop nesting level.
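/// For example, in a straight-line region where A dominates B and B
/// post-dominates A at the same loop nesting level, every execution of A
/// implies exactly one execution of B and vice versa, so both blocks receive
/// the same (maximum) count.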
void equalizeBBCounts(BinaryFunction &BF) {
auto Info = DataflowInfoManager(BF.getBinaryContext(), BF, nullptr, nullptr);
auto &DA = Info.getDominatorAnalysis();
auto &PDA = Info.getPostDominatorAnalysis();
auto &InsnToBB = Info.getInsnToBBMap();
// These analyses work at the instruction granularity, but we really only need
// basic block granularity here. So we'll use a set of visited edges to avoid
// revisiting the same BBs again and again.
DenseMap<const BinaryBasicBlock *, std::set<const BinaryBasicBlock *>>
Visited;
// Equivalence classes mapping. Each equivalence class is defined by the set
// of BBs that obeys the aforementioned properties.
DenseMap<const BinaryBasicBlock *, signed> BBsToEC;
std::vector<std::vector<BinaryBasicBlock *>> Classes;
BF.calculateLoopInfo();
auto LoopNestLevel = createLoopNestLevelMap(BF);
for (auto &BB : BF) {
BBsToEC[&BB] = -1;
}
for (auto &BB : BF) {
auto I = BB.begin();
if (I == BB.end())
continue;
DA.doForAllDominators(*I, [&](const MCInst &DomInst) {
auto *DomBB = InsnToBB[&DomInst];
if (Visited[DomBB].count(&BB))
return;
Visited[DomBB].insert(&BB);
if (!PDA.doesADominateB(*I, DomInst))
return;
if (LoopNestLevel[&BB] != LoopNestLevel[DomBB])
return;
if (BBsToEC[DomBB] == -1 && BBsToEC[&BB] == -1) {
BBsToEC[DomBB] = Classes.size();
BBsToEC[&BB] = Classes.size();
Classes.emplace_back();
Classes.back().push_back(DomBB);
Classes.back().push_back(&BB);
return;
}
if (BBsToEC[DomBB] == -1) {
BBsToEC[DomBB] = BBsToEC[&BB];
Classes[BBsToEC[&BB]].push_back(DomBB);
return;
}
if (BBsToEC[&BB] == -1) {
BBsToEC[&BB] = BBsToEC[DomBB];
Classes[BBsToEC[DomBB]].push_back(&BB);
return;
}
auto BBECNum = BBsToEC[&BB];
if (BBsToEC[DomBB] == BBECNum)
return;
// Merge through references: taking copies here would leave the real
// equivalence classes in Classes unmodified.
auto &DomEC = Classes[BBsToEC[DomBB]];
auto &BBEC = Classes[BBECNum];
for (auto *Block : DomEC) {
BBsToEC[Block] = BBECNum;
BBEC.push_back(Block);
}
DomEC.clear();
});
}
for (auto &Class : Classes) {
uint64_t Max{0ULL};
for (auto *BB : Class) {
Max = std::max(Max, BB->getExecutionCount());
}
for (auto *BB : Class) {
BB->setExecutionCount(Max);
}
}
}
} // end anonymous namespace
void estimateEdgeCounts(BinaryFunction &BF) {
EdgeWeightMap PredEdgeWeights;
EdgeWeightMap SuccEdgeWeights;
if (!opts::IterativeGuess) {
computeEdgeWeights<Inverse<BinaryBasicBlock *>>(BF, PredEdgeWeights);
computeEdgeWeights<BinaryBasicBlock *>(BF, SuccEdgeWeights);
}
if (opts::EqualizeBBCounts) {
LLVM_DEBUG(BF.print(dbgs(), "before equalize BB counts", true));
equalizeBBCounts(BF);
LLVM_DEBUG(BF.print(dbgs(), "after equalize BB counts", true));
}
if (opts::IterativeGuess)
guessEdgeByIterativeApproach(BF);
else
guessEdgeByRelHotness(BF, /*UseSucc=*/false, PredEdgeWeights,
SuccEdgeWeights);
recalculateBBCounts(BF, /*AllEdges=*/false);
}
void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction) {
llvm_unreachable("not implemented");
}
} // namespace bolt
} // namespace llvm

bolt/src/Passes/MCF.h (new file, 52 lines)

@ -0,0 +1,52 @@
//===--- Passes/MCF.h -----------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_MCF_H
#define LLVM_TOOLS_LLVM_BOLT_MCF_H
namespace llvm {
namespace bolt {
class BinaryFunction;
enum MCFCostFunction : char {
MCF_DISABLE = 0,
MCF_LINEAR,
MCF_QUADRATIC,
MCF_LOG,
MCF_BLAMEFTS
};
/// Fill edge counts based on the basic block counts. Used in non-LBR mode
/// when we only have BB counts.
void estimateEdgeCounts(BinaryFunction &BF);
/// Entry point for computing a min-cost flow for the CFG with the goal
/// of fixing the flow of the CFG edges, that is, making sure it obeys the
/// flow-conservation equation SumInEdges = SumOutEdges.
///
/// To do this, we create an instance of the min-cost flow problem in a
/// similar way as the one discussed in the work of Roy Levin "Completing
/// Incomplete Edge Profile by Applying Minimum Cost Circulation Algorithms".
/// We do a few things differently, though. We don't populate edge counts using
/// weights coming from a static branch prediction technique and we don't
/// use the same cost function.
///
/// If cost function BlameFTs is used, assign all remaining flow to
/// fall-throughs. This is used when the sampling is based on taken branches
/// that do not account for them.
void solveMCF(BinaryFunction &BF, MCFCostFunction CostFunction);
} // namespace bolt
} // namespace llvm
#endif // LLVM_TOOLS_LLVM_BOLT_MCF_H
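Restating the flow-conservation equation above in symbols (a gloss, not text from the commit): for every basic block $v$,

$$\sum_{(u,v) \in E} f(u,v) = \sum_{(v,w) \in E} f(v,w)$$

where $f$ assigns a repaired count to each CFG edge in $E$. Minimum-cost-circulation formulations such as Levin's typically close the system with an artificial exit-to-entry edge so that the equation can also hold at the entry and exit blocks.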


@ -12,7 +12,7 @@
//===----------------------------------------------------------------------===//
#include "PLTCall.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#define DEBUG_TYPE "bolt-plt"


@ -13,7 +13,7 @@
#include "PatchEntries.h"
#include "NameResolver.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
namespace opts {


@ -155,8 +155,9 @@ std::vector<Cluster> pettisAndHansen(const CallGraph &Cg) {
orderFuncs(Cg, C1, C2);
DEBUG(dbgs() << format("merging %s -> %s: %.1f\n", C2->toString().c_str(),
C1->toString().c_str(), Max.Weight););
LLVM_DEBUG(dbgs() << format("merging %s -> %s: %.1f\n",
C2->toString().c_str(), C1->toString().c_str(),
Max.Weight));
// update carcs: merge C1arcs to C2arcs


@ -13,6 +13,7 @@
#include "DataflowInfoManager.h"
#include "MCPlus.h"
#include "RegReAssign.h"
#include "Utils.h"
#include <numeric>
#define DEBUG_TYPE "regreassign"
@ -71,7 +72,7 @@ void RegReAssign::swap(BinaryContext &BC, BinaryFunction &Function, MCPhysReg A,
for (auto &Inst : BB) {
if (!BC.MIB->isCFI(Inst))
continue;
auto *CFI = Function.getCFIFor(Inst);
const MCCFIInstruction *CFI = Function.getCFIFor(Inst);
if (Changed.count(CFI))
continue;
Changed.insert(CFI);
@ -79,16 +80,24 @@ void RegReAssign::swap(BinaryContext &BC, BinaryFunction &Function, MCPhysReg A,
switch (CFI->getOperation()) {
case MCCFIInstruction::OpRegister: {
const auto CFIReg2 = CFI->getRegister2();
const MCPhysReg Reg2 = BC.MRI->getLLVMRegNum(CFIReg2, /*isEH=*/false);
const MCPhysReg Reg2 = *BC.MRI->getLLVMRegNum(CFIReg2, /*isEH=*/false);
if (AliasA.test(Reg2)) {
CFI->setRegister2(BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(B, BC.MIB->getRegSize(Reg2)), false));
Function.setCFIFor(
Inst, MCCFIInstruction::createRegister(
nullptr, CFI->getRegister(),
BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(B, BC.MIB->getRegSize(Reg2)),
false)));
} else if (AliasB.test(Reg2)) {
CFI->setRegister2(BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(A, BC.MIB->getRegSize(Reg2)), false));
Function.setCFIFor(
Inst, MCCFIInstruction::createRegister(
nullptr, CFI->getRegister(),
BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(A, BC.MIB->getRegSize(Reg2)),
false)));
}
}
// Fall-through
LLVM_FALLTHROUGH;
case MCCFIInstruction::OpUndefined:
case MCCFIInstruction::OpDefCfa:
case MCCFIInstruction::OpOffset:
@ -96,16 +105,29 @@ void RegReAssign::swap(BinaryContext &BC, BinaryFunction &Function, MCPhysReg A,
case MCCFIInstruction::OpSameValue:
case MCCFIInstruction::OpDefCfaRegister:
case MCCFIInstruction::OpRelOffset:
case MCCFIInstruction::OpExpression:
case MCCFIInstruction::OpValExpression: {
const auto CFIReg = CFI->getRegister();
const MCPhysReg Reg = BC.MRI->getLLVMRegNum(CFIReg, /*isEH=*/false);
case MCCFIInstruction::OpEscape: {
unsigned CFIReg;
if (CFI->getOperation() != MCCFIInstruction::OpEscape) {
CFIReg = CFI->getRegister();
} else {
Optional<uint8_t> Reg =
readDWARFExpressionTargetReg(CFI->getValues());
// Handle DW_CFA_def_cfa_expression
if (!Reg)
break;
CFIReg = *Reg;
}
const MCPhysReg Reg = *BC.MRI->getLLVMRegNum(CFIReg, /*isEH=*/false);
if (AliasA.test(Reg)) {
CFI->setRegister(BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(B, BC.MIB->getRegSize(Reg)), false));
Function.mutateCFIRegisterFor(
Inst,
BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(B, BC.MIB->getRegSize(Reg)), false));
} else if (AliasB.test(Reg)) {
CFI->setRegister(BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(A, BC.MIB->getRegSize(Reg)), false));
Function.mutateCFIRegisterFor(
Inst,
BC.MRI->getDwarfRegNum(
BC.MIB->getAliasSized(A, BC.MIB->getRegSize(Reg)), false));
}
break;
}
@ -180,7 +202,7 @@ void RegReAssign::rankRegisters(BinaryContext &BC, BinaryFunction &Function) {
std::sort(RankedRegs.begin(), RankedRegs.end(),
[&](size_t A, size_t B) { return RegScore[A] > RegScore[B]; });
DEBUG({
LLVM_DEBUG({
for (auto Reg : RankedRegs) {
if (RegScore[Reg] == 0)
continue;
@ -259,34 +281,34 @@ void RegReAssign::aggressivePassOverFunction(BinaryContext &BC,
}
if (RegScore[ClassicReg] << 1 >= RegScore[ExtReg]) {
DEBUG(dbgs() << " Ending at " << BC.MRI->getName(ClassicReg) << " with "
<< BC.MRI->getName(ExtReg)
<< " because exchange is not profitable\n");
LLVM_DEBUG(dbgs() << " Ending at " << BC.MRI->getName(ClassicReg)
<< " with " << BC.MRI->getName(ExtReg)
<< " because exchange is not profitable\n");
break;
}
BitVector AnyAliasAlive = AliveAtStart;
AnyAliasAlive &= BC.MIB->getAliases(ClassicReg);
if (AnyAliasAlive.any()) {
DEBUG(dbgs() << " Bailed on " << BC.MRI->getName(ClassicReg) << " with "
<< BC.MRI->getName(ExtReg)
<< " because classic reg is alive\n");
LLVM_DEBUG(dbgs() << " Bailed on " << BC.MRI->getName(ClassicReg)
<< " with " << BC.MRI->getName(ExtReg)
<< " because classic reg is alive\n");
--End;
continue;
}
AnyAliasAlive = AliveAtStart;
AnyAliasAlive &= BC.MIB->getAliases(ExtReg);
if (AnyAliasAlive.any()) {
DEBUG(dbgs() << " Bailed on " << BC.MRI->getName(ClassicReg) << " with "
<< BC.MRI->getName(ExtReg)
<< " because extended reg is alive\n");
LLVM_DEBUG(dbgs() << " Bailed on " << BC.MRI->getName(ClassicReg)
<< " with " << BC.MRI->getName(ExtReg)
<< " because extended reg is alive\n");
++Begin;
continue;
}
// Opportunity detected. Swap.
DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(ClassicReg) << " with "
<< BC.MRI->getName(ExtReg) << "\n\n");
LLVM_DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(ClassicReg)
<< " with " << BC.MRI->getName(ExtReg) << "\n\n");
swap(BC, Function, ClassicReg, ExtReg);
FuncsChanged.insert(&Function);
++Begin;
@ -328,8 +350,8 @@ bool RegReAssign::conservativePassOverFunction(BinaryContext &BC,
if (!RBX)
return false;
DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(RBX) << " with "
<< BC.MRI->getName(Candidate) << "\n\n");
LLVM_DEBUG(dbgs() << "\n ** Swapping " << BC.MRI->getName(RBX) << " with "
<< BC.MRI->getName(Candidate) << "\n\n");
swap(BC, Function, RBX, Candidate);
FuncsChanged.insert(&Function);
return true;
@ -365,7 +387,7 @@ void RegReAssign::setupConservativePass(
ExtendedCSR.flip();
ExtendedCSR &= CalleeSaved;
DEBUG({
LLVM_DEBUG({
RegStatePrinter P(BC);
dbgs() << "Starting register reassignment\nClassicRegs: ";
P.print(dbgs(), ClassicRegs);
@ -394,12 +416,12 @@ void RegReAssign::runOnFunctions(BinaryContext &BC) {
if (!Function.isSimple() || Function.isIgnored())
continue;
DEBUG(dbgs() << "====================================\n");
DEBUG(dbgs() << " - " << Function.getPrintName() << "\n");
LLVM_DEBUG(dbgs() << "====================================\n");
LLVM_DEBUG(dbgs() << " - " << Function.getPrintName() << "\n");
if (!conservativePassOverFunction(BC, Function) &&
opts::AggressiveReAssign) {
aggressivePassOverFunction(BC, Function);
DEBUG({
LLVM_DEBUG({
if (FuncsChanged.count(&Function)) {
dbgs() << "Aggressive pass successful on " << Function.getPrintName()
<< "\n";


@ -18,6 +18,7 @@
#include <functional>
#include <queue>
#include <random>
#include <stack>
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"
@ -177,13 +178,11 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
const auto *SrcBB = E.Src;
const auto *DstBB = E.Dst;
DEBUG(dbgs() << "Popped edge ";
E.print(dbgs());
dbgs() << "\n");
LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");
// Case 1: BBSrc and BBDst are the same. Ignore this edge
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
continue;
}
@ -195,7 +194,7 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
if (I == J) {
if (ComputeEdges)
ClusterEdges[I][I] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
continue;
}
@ -221,7 +220,7 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
}
// Adjust the weights of the remaining edges and re-sort the queue.
adjustQueue(Queue, BF);
DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
} else {
// Case 4: Both SrcBB and DstBB are allocated in positions we cannot
// merge them. Add the count of this edge to the inter-cluster edge count
@ -229,7 +228,8 @@ void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
// clusters.
if (ComputeEdges)
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
LLVM_DEBUG(
dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
}
}
}
@ -352,9 +352,8 @@ void MinBranchGreedyClusterAlgorithm::adjustQueue(
// Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
// this edge.
if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (same src, dst)\n");
LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
dbgs() << " (same src, dst)\n");
continue;
}
@ -370,10 +369,9 @@ void MinBranchGreedyClusterAlgorithm::adjustQueue(
if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
if (!ClusterEdges.empty())
ClusterEdges[I][J] += E.Count;
DEBUG(dbgs() << "\tAdjustment: Ignored edge ";
E.print(dbgs());
dbgs() << " (src, dst belong to same cluster or incompatible "
"clusters)\n");
LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
dbgs() << " (src, dst belong to same cluster or incompatible "
"clusters)\n");
for (const auto *SuccBB : SrcBB->successors()) {
if (SuccBB == DstBB)
continue;


@ -358,8 +358,8 @@ void ReorderData::setSectionOrder(BinaryContext &BC,
TotalCount += Itr->second;
}
DEBUG(dbgs() << "BOLT-DEBUG: setSectionOrder for "
<< OutputSection.getName() << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: setSectionOrder for "
<< OutputSection.getName() << "\n");
for (; Begin != End; ++Begin) {
auto *BD = Begin->first;
@ -371,8 +371,8 @@ void ReorderData::setSectionOrder(BinaryContext &BC,
++NumReordered;
if (NumReordered > opts::ReorderDataMaxSymbols) {
if (!NewOrder.empty()) {
dbgs() << "BOLT-DEBUG: processing ending on symbol "
<< *NewOrder.back() << "\n";
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: processing ending on symbol "
<< *NewOrder.back() << "\n");
}
break;
}
@ -382,14 +382,14 @@ void ReorderData::setSectionOrder(BinaryContext &BC,
if ((Offset + BD->getSize()) > opts::ReorderDataMaxBytes) {
if (!NewOrder.empty()) {
dbgs() << "BOLT-DEBUG: processing ending on symbol "
<< *NewOrder.back() << "\n";
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: processing ending on symbol "
<< *NewOrder.back() << "\n");
}
break;
}
DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() << " @ 0x"
<< Twine::utohexstr(Offset) << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " << BD->getName() << " @ 0x"
<< Twine::utohexstr(Offset) << "\n");
BD->setOutputLocation(OutputSection, Offset);
@ -397,8 +397,8 @@ void ReorderData::setSectionOrder(BinaryContext &BC,
for (auto &SubBD : BC.getSubBinaryData(BD)) {
if (!SubBD.second->isJumpTable()) {
auto SubOffset = Offset + SubBD.second->getAddress() - BD->getAddress();
DEBUG(dbgs() << "BOLT-DEBUG: SubBD " << SubBD.second->getName()
<< " @ " << SubOffset << "\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: SubBD " << SubBD.second->getName()
<< " @ " << SubOffset << "\n");
SubBD.second->setOutputLocation(OutputSection, SubOffset);
}
}


@ -11,7 +11,7 @@
#include "ReorderFunctions.h"
#include "HFSort.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <fstream>
#define DEBUG_TYPE "hfsort"
@ -396,7 +396,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
std::unique_ptr<std::ofstream> FuncsFile;
if (!opts::GenerateFunctionOrderFile.empty()) {
FuncsFile =
llvm::make_unique<std::ofstream>(opts::GenerateFunctionOrderFile,
std::make_unique<std::ofstream>(opts::GenerateFunctionOrderFile,
std::ios::out);
if (!FuncsFile) {
errs() << "BOLT-ERROR: ordered functions file "
@ -408,7 +408,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) {
std::unique_ptr<std::ofstream> LinkSectionsFile;
if (!opts::LinkSectionsFile.empty()) {
LinkSectionsFile =
llvm::make_unique<std::ofstream>(opts::LinkSectionsFile,
std::make_unique<std::ofstream>(opts::LinkSectionsFile,
std::ios::out);
if (!LinkSectionsFile) {
errs() << "BOLT-ERROR: link sections file "


@ -85,14 +85,14 @@ BinaryFunction *createNewRetpoline(BinaryContext &BC,
bool R11Available) {
auto &MIB = *BC.MIB;
auto &Ctx = *BC.Ctx.get();
DEBUG(dbgs() << "BOLT-DEBUG: Creating a new retpoline function["
<< RetpolineTag << "]\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Creating a new retpoline function["
<< RetpolineTag << "]\n");
auto *NewRetpoline = BC.createInjectedBinaryFunction(RetpolineTag, true);
std::vector<std::unique_ptr<BinaryBasicBlock>> NewBlocks(3);
for (int I = 0; I < 3; I++) {
auto Symbol =
Ctx.createTempSymbol(Twine(RetpolineTag + "_BB" + to_string(I)), true);
Ctx.createNamedTempSymbol(Twine(RetpolineTag + "_BB" + to_string(I)));
NewBlocks[I] = NewRetpoline->createBasicBlock(
BinaryBasicBlock::INVALID_OFFSET, Symbol);
NewBlocks[I].get()->setCFIState(0);


@ -12,6 +12,7 @@
#include "MCPlus.h"
#include "ShrinkWrapping.h"
#include <numeric>
#include <stack>
#define DEBUG_TYPE "shrinkwrapping"
@ -39,9 +40,9 @@ void CalleeSavedAnalysis::analyzeSaves() {
auto &InsnToBB = Info.getInsnToBBMap();
BitVector BlacklistedRegs(BC.MRI->getNumRegs(), false);
DEBUG(dbgs() << "Checking spill locations\n");
LLVM_DEBUG(dbgs() << "Checking spill locations\n");
for (auto &BB : BF) {
DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
const MCInst *Prev = nullptr;
for (auto &Inst : BB) {
if (auto FIE = FA.getFIEFor(Inst)) {
@ -104,8 +105,8 @@ void CalleeSavedAnalysis::analyzeSaves() {
SavingCost[FIE->RegOrImm] += InsnToBB[&Inst]->getKnownExecutionCount();
BC.MIB->addAnnotation(Inst, getSaveTag(), FIE->RegOrImm, AllocatorId);
OffsetsByReg[FIE->RegOrImm] = FIE->StackOffset;
DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: "
<< FIE->RegOrImm << "\n");
LLVM_DEBUG(dbgs() << "Logging new candidate for Callee-Saved Reg: "
<< FIE->RegOrImm << "\n");
}
Prev = &Inst;
}
@ -142,15 +143,15 @@ void CalleeSavedAnalysis::analyzeRestores() {
// we don't completely understand what's happening here
if (FIE->StackOffset != OffsetsByReg[FIE->RegOrImm]) {
CalleeSaved.reset(FIE->RegOrImm);
DEBUG(dbgs() << "Dismissing Callee-Saved Reg because we found a "
"mismatching restore: "
<< FIE->RegOrImm << "\n");
LLVM_DEBUG(dbgs() << "Dismissing Callee-Saved Reg because we found a "
"mismatching restore: "
<< FIE->RegOrImm << "\n");
Prev = &Inst;
continue;
}
DEBUG(dbgs() << "Adding matching restore for: " << FIE->RegOrImm
<< "\n");
LLVM_DEBUG(dbgs() << "Adding matching restore for: " << FIE->RegOrImm
<< "\n");
if (LoadFIEByReg[FIE->RegOrImm] == nullptr)
LoadFIEByReg[FIE->RegOrImm] = &*FIE;
BC.MIB->addAnnotation(Inst, getRestoreTag(), FIE->RegOrImm,
@ -358,8 +359,8 @@ void StackLayoutModifier::classifyStackAccesses() {
BC.MIB->addAnnotation(Inst, getSlotTag(), FIEX->StackOffset, AllocatorId);
RegionToRegMap[FIEX->StackOffset].insert(FIEX->RegOrImm);
RegToRegionMap[FIEX->RegOrImm].insert(FIEX->StackOffset);
DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size "
<< (int)FIEX->Size << "\n");
LLVM_DEBUG(dbgs() << "Adding region " << FIEX->StackOffset << " size "
<< (int)FIEX->Size << "\n");
}
}
}
@ -370,10 +371,10 @@ void StackLayoutModifier::classifyCFIs() {
uint16_t CfaReg{7};
auto recordAccess = [&](MCInst *Inst, int64_t Offset) {
const uint16_t Reg = BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false);
const uint16_t Reg = *BC.MRI->getLLVMRegNum(CfaReg, /*isEH=*/false);
if (Reg == BC.MIB->getStackPointer() || Reg == BC.MIB->getFramePointer()) {
BC.MIB->addAnnotation(*Inst, getSlotTag(), Offset, AllocatorId);
DEBUG(dbgs() << "Recording CFI " << Offset << "\n");
LLVM_DEBUG(dbgs() << "Recording CFI " << Offset << "\n");
} else {
IsSimple = false;
return;
@ -384,17 +385,17 @@ void StackLayoutModifier::classifyCFIs() {
for (auto &Inst : *BB) {
if (!BC.MIB->isCFI(Inst))
continue;
auto *CFI = BF.getCFIFor(Inst);
const MCCFIInstruction *CFI = BF.getCFIFor(Inst);
switch (CFI->getOperation()) {
case MCCFIInstruction::OpDefCfa:
CfaOffset = CFI->getOffset();
CfaOffset = -CFI->getOffset();
recordAccess(&Inst, CfaOffset);
// Fall-through
LLVM_FALLTHROUGH;
case MCCFIInstruction::OpDefCfaRegister:
CfaReg = CFI->getRegister();
break;
case MCCFIInstruction::OpDefCfaOffset:
CfaOffset = CFI->getOffset();
CfaOffset = -CFI->getOffset();
recordAccess(&Inst, CfaOffset);
break;
case MCCFIInstruction::OpOffset:
@ -661,10 +662,15 @@ void StackLayoutModifier::performChanges() {
if (ModifiedCFIIndices.count(CFINum))
continue;
ModifiedCFIIndices.insert(CFINum);
MCCFIInstruction *CFI = BF.getCFIFor(Inst);
DEBUG(dbgs() << "Changing CFI offset from " << CFI->getOffset()
<< " to " << (CFI->getOffset() + Adjustment) << "\n");
CFI->setOffset(CFI->getOffset() + Adjustment);
const MCCFIInstruction *CFI = BF.getCFIFor(Inst);
const MCCFIInstruction::OpType Operation = CFI->getOperation();
if (Operation == MCCFIInstruction::OpDefCfa ||
Operation == MCCFIInstruction::OpDefCfaOffset) {
Adjustment = 0 - Adjustment;
}
LLVM_DEBUG(dbgs() << "Changing CFI offset from " << CFI->getOffset()
<< " to " << (CFI->getOffset() + Adjustment) << "\n");
BF.mutateCFIOffsetFor(Inst, CFI->getOffset() + Adjustment);
continue;
}
int32_t SrcImm{0};
@ -696,7 +702,7 @@ void StackLayoutModifier::performChanges() {
else if (IsStore)
Success = BC.MIB->createSaveToStack(
Inst, StackPtrReg, StackOffset + Adjustment, Reg, Size);
DEBUG({
LLVM_DEBUG({
dbgs() << "Adjusted instruction: ";
Inst.dump();
});
@ -758,16 +764,18 @@ void ShrinkWrapping::pruneUnwantedCSRs() {
continue;
}
if (UsesByReg[I].empty()) {
DEBUG(dbgs()
<< "Dismissing Callee-Saved Reg because we found no uses of it:"
<< I << "\n");
LLVM_DEBUG(
dbgs()
<< "Dismissing Callee-Saved Reg because we found no uses of it:" << I
<< "\n");
CSA.CalleeSaved.reset(I);
continue;
}
if (!CSA.HasRestores[I]) {
DEBUG(dbgs() << "Dismissing Callee-Saved Reg because it does not have "
"restores:"
<< I << "\n");
LLVM_DEBUG(
dbgs() << "Dismissing Callee-Saved Reg because it does not have "
"restores:"
<< I << "\n");
CSA.CalleeSaved.reset(I);
}
}
@ -779,9 +787,9 @@ void ShrinkWrapping::computeSaveLocations() {
auto &DA = Info.getDominatorAnalysis();
auto &SPT = Info.getStackPointerTracking();
DEBUG(dbgs() << "Checking save/restore possibilities\n");
LLVM_DEBUG(dbgs() << "Checking save/restore possibilities\n");
for (auto &BB : BF) {
DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << "\tNow at BB " << BB.getName() << "\n");
MCInst *First = BB.begin() != BB.end() ? &*BB.begin() : nullptr;
if (!First)
@ -808,16 +816,16 @@ void ShrinkWrapping::computeSaveLocations() {
if (DA.doesADominateB(*First, J))
BBDominatedUses.set(J);
}
DEBUG(dbgs() << "\t\tBB " << BB.getName() << " dominates "
<< BBDominatedUses.count() << " uses for reg " << I
<< ". Total uses for reg is " << UsesByReg[I].count()
<< "\n");
LLVM_DEBUG(dbgs() << "\t\tBB " << BB.getName() << " dominates "
<< BBDominatedUses.count() << " uses for reg " << I
<< ". Total uses for reg is " << UsesByReg[I].count()
<< "\n");
BBDominatedUses &= UsesByReg[I];
if (BBDominatedUses == UsesByReg[I]) {
DEBUG(dbgs() << "\t\t\tAdded " << BB.getName() << " as a save pos for "
<< I << "\n");
LLVM_DEBUG(dbgs() << "\t\t\tAdded " << BB.getName()
<< " as a save pos for " << I << "\n");
SavePos[I].insert(First);
DEBUG({
LLVM_DEBUG({
dbgs() << "Dominated uses are:\n";
for (auto J = UsesByReg[I].find_first(); J > 0;
J = UsesByReg[I].find_next(J)) {
@ -890,7 +898,7 @@ bool ShrinkWrapping::isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave,
bool ShouldMove{false};
if (BestCount != std::numeric_limits<uint64_t>::max() &&
BestCount < (opts::ShrinkWrappingThreshold / 100.0) * CurSavingCost) {
DEBUG({
LLVM_DEBUG({
auto &InsnToBB = Info.getInsnToBBMap();
dbgs() << "Better position for saves found in func " << BF.getPrintName()
<< " count << " << BF.getKnownExecutionCount() << "\n";
@ -905,7 +913,7 @@ bool ShrinkWrapping::isBestSavePosCold(unsigned CSR, MCInst *&BestPosSave,
if (!ShouldMove)
return false;
if (!BestPosSave) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Dropping opportunity because we don't know where to put "
"stores -- total est. freq reduc: "
<< TotalEstimatedWin << "\n";
@ -922,8 +930,8 @@ void ShrinkWrapping::splitFrontierCritEdges(
const SmallVector<bool, 4> &IsCritEdge,
const SmallVector<BinaryBasicBlock *, 4> &From,
const SmallVector<SmallVector<BinaryBasicBlock *, 4>, 4> &To) {
DEBUG(dbgs() << "splitFrontierCritEdges: Now handling func "
<< BF.getPrintName() << "\n");
LLVM_DEBUG(dbgs() << "splitFrontierCritEdges: Now handling func "
<< BF.getPrintName() << "\n");
// For every FromBB, there might be one or more critical edges, with
// To[I] containing destination BBs. It's important to record
// the original size of the Frontier, as we may append to it while splitting
@ -934,11 +942,12 @@ void ShrinkWrapping::splitFrontierCritEdges(
if (To[I].empty())
continue;
auto FromBB = From[I];
DEBUG(dbgs() << " - Now handling FrontierBB " << FromBB->getName() << "\n");
LLVM_DEBUG(dbgs() << " - Now handling FrontierBB " << FromBB->getName()
<< "\n");
// Split edge for every DestinationBBs
for (size_t DI = 0, DIE = To[I].size(); DI < DIE; ++DI) {
auto DestinationBB = To[I][DI];
DEBUG(dbgs() << " - Dest : " << DestinationBB->getName() << "\n");
LLVM_DEBUG(dbgs() << " - Dest : " << DestinationBB->getName() << "\n");
auto *NewBB = Func->splitEdge(FromBB, DestinationBB);
// Insert dummy instruction so this BB is never empty (we need this for
// PredictiveStackPointerTracking to work, since it annotates instructions
@ -955,11 +964,13 @@ void ShrinkWrapping::splitFrontierCritEdges(
if (DI == 0) {
// Update frontier inplace
Frontier[I] = NewFrontierPP;
DEBUG(dbgs() << " - Update frontier with " << NewBB->getName() << '\n');
LLVM_DEBUG(dbgs() << " - Update frontier with " << NewBB->getName()
<< '\n');
} else {
// Append new frontier to the end of the list
Frontier.push_back(NewFrontierPP);
DEBUG(dbgs() << " - Append frontier " << NewBB->getName() << '\n');
LLVM_DEBUG(dbgs() << " - Append frontier " << NewBB->getName()
<< '\n');
}
}
}
@ -979,7 +990,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
// into edges transitioning to the dominance frontier, otherwise we pull these
// restores to inside the dominated area.
Frontier = DA.getDominanceFrontierFor(*BestPosSave).takeVector();
DEBUG({
LLVM_DEBUG({
dbgs() << "Dumping dominance frontier for ";
BC.printInstruction(dbgs(), *BestPosSave);
for (auto &PP : Frontier) {
@ -1003,7 +1014,8 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
// Check for invoke instructions at the dominance frontier, which indicates
// the landing pad is not dominated.
if (PP.isInst() && BC.MIB->isInvoke(*PP.getInst())) {
DEBUG(dbgs() << "Bailing on restore placement to avoid LP splitting\n");
LLVM_DEBUG(
dbgs() << "Bailing on restore placement to avoid LP splitting\n");
Frontier.clear();
return Frontier;
}
@ -1031,7 +1043,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
InvalidateRequired = true;
}
if (std::accumulate(IsCritEdge.begin(), IsCritEdge.end(), 0)) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Now detected critical edges in the following frontier:\n";
for (auto &PP : Frontier) {
if (PP.isBB())
@ -1054,7 +1066,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
classifyCSRUses();
}
if (CannotPlace) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Dropping opportunity because restore placement failed"
" -- total est. freq reduc: "
<< TotalEstimatedWin << "\n";
@ -1068,7 +1080,7 @@ ShrinkWrapping::doRestorePlacement(MCInst *BestPosSave, unsigned CSR,
bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
int64_t SaveOffset) {
if (FA.requiresAlignment(BF)) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Reg " << CSR << " is not using push/pops due to function "
"alignment requirements.\n";
});
@ -1076,14 +1088,14 @@ bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
}
for (MCInst *Save : CSA.getSavesByReg(CSR)) {
if (!SLM.canCollapseRegion(Save)) {
DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n");
LLVM_DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n");
return false;
}
}
// Abort if one of the restores for this CSR is not a POP.
for (MCInst *Load : CSA.getRestoresByReg(CSR)) {
if (!BC.MIB->isPop(*Load)) {
DEBUG(dbgs() << "Reg " << CSR << " has a mismatching restore.\n");
LLVM_DEBUG(dbgs() << "Reg " << CSR << " has a mismatching restore.\n");
return false;
}
}
@ -1094,7 +1106,7 @@ bool ShrinkWrapping::validatePushPopsMode(unsigned CSR, MCInst *BestPosSave,
if (!SLM.canInsertRegion(BestPosSave) ||
SaveOffset == SPT.SUPERPOSITION || SaveOffset == SPT.EMPTY ||
(SaveOffset == -8 && SPT.HasFramePointer)) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Reg " << CSR << " cannot insert region or we are "
"trying to insert a push into entry bb.\n";
});
@ -1135,7 +1147,7 @@ SmallVector<ProgramPoint, 4> ShrinkWrapping::fixPopsPlacements(
}
}
if (!Found) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Could not find restore insertion point for " << CSR
<< ", falling back to load/store mode\n";
});
@ -1182,7 +1194,7 @@ void ShrinkWrapping::scheduleOldSaveRestoresRemoval(unsigned CSR,
const bool RecordDeletedPopCFIs =
RestoredReg == CSR && DeletedPopCFIs[CSR].empty();
for (MCInst *CFI : CFIs) {
auto *MCCFI = BF.getCFIFor(*CFI);
const MCCFIInstruction *MCCFI = BF.getCFIFor(*CFI);
// Do not touch these...
if (MCCFI->getOperation() == MCCFIInstruction::OpRestoreState ||
MCCFI->getOperation() == MCCFIInstruction::OpRememberState)
@ -1223,7 +1235,7 @@ void ShrinkWrapping::scheduleSaveRestoreInsertions(
auto FIELoad = CSA.LoadFIEByReg[CSR];
assert(FIESave && FIELoad && "Invalid CSR");
DEBUG({
LLVM_DEBUG({
dbgs() << "Scheduling save insertion at: ";
BestPosSave->dump();
});
@ -1234,7 +1246,7 @@ void ShrinkWrapping::scheduleSaveRestoreInsertions(
for (auto &PP : RestorePoints) {
BinaryBasicBlock *FrontierBB = Info.getParentBB(PP);
DEBUG({
LLVM_DEBUG({
dbgs() << "Scheduling restore insertion at: ";
if (PP.isInst())
PP.getInst()->dump();
@ -1547,7 +1559,7 @@ void ShrinkWrapping::insertUpdatedCFI(unsigned CSR, int SPValPush,
break;
}
assert(SavePoint);
DEBUG({
LLVM_DEBUG({
dbgs() << "Now using as save point for reg " << CSR << " :";
SavePoint->dump();
});
@ -1602,7 +1614,7 @@ void ShrinkWrapping::rebuildCFIForSP() {
for (auto &Inst : BB) {
if (!BC.MIB->isCFI(Inst))
continue;
auto *CFI = BF.getCFIFor(Inst);
const MCCFIInstruction *CFI = BF.getCFIFor(Inst);
if (CFI->getOperation() == MCCFIInstruction::OpDefCfaOffset)
BC.MIB->addAnnotation(Inst, "DeleteMe", 0U, AllocatorId);
}
@ -1624,19 +1636,19 @@ void ShrinkWrapping::rebuildCFIForSP() {
++InsertionIter;
Iter = BF.addCFIInstruction(
BB, InsertionIter,
MCCFIInstruction::createDefCfaOffset(nullptr, -CurVal));
MCCFIInstruction::cfiDefCfaOffset(nullptr, -CurVal));
SPVal = CurVal;
}
}
if (BF.isSplit() && PrevBB && BB->isCold() != PrevBB->isCold()) {
BF.addCFIInstruction(
BB, BB->begin(),
MCCFIInstruction::createDefCfaOffset(nullptr, -SPValAtBegin));
MCCFIInstruction::cfiDefCfaOffset(nullptr, -SPValAtBegin));
} else {
if (SPValAtBegin != PrevSPVal) {
BF.addCFIInstruction(
PrevBB, PrevBB->end(),
MCCFIInstruction::createDefCfaOffset(nullptr, -SPValAtBegin));
MCCFIInstruction::cfiDefCfaOffset(nullptr, -SPValAtBegin));
}
}
PrevSPVal = SPValAtEnd;
@ -1700,7 +1712,7 @@ MCInst ShrinkWrapping::createStackAccess(int SPVal, int FPVal,
}
void ShrinkWrapping::updateCFIInstOffset(MCInst &Inst, int64_t NewOffset) {
auto *CFI = BF.getCFIFor(Inst);
const MCCFIInstruction *CFI = BF.getCFIFor(Inst);
if (UpdatedCFIs.count(CFI))
return;
@ -1708,7 +1720,7 @@ void ShrinkWrapping::updateCFIInstOffset(MCInst &Inst, int64_t NewOffset) {
case MCCFIInstruction::OpDefCfa:
case MCCFIInstruction::OpDefCfaRegister:
case MCCFIInstruction::OpDefCfaOffset:
CFI->setOffset(NewOffset);
CFI = BF.mutateCFIOffsetFor(Inst, -NewOffset);
break;
case MCCFIInstruction::OpOffset:
default:
@ -1771,7 +1783,7 @@ BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
}
}
DEBUG({
LLVM_DEBUG({
dbgs() << "Creating stack access with SPVal = " << SPVal
<< "; stack offset = " << Item.FIEToInsert.StackOffset
<< " Is push = " << (Item.Action == WorklistItem::InsertPushOrPop)
@ -1781,7 +1793,7 @@ BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
createStackAccess(SPVal, FPVal, Item.FIEToInsert,
Item.Action == WorklistItem::InsertPushOrPop);
if (InsertionPoint != CurBB->end()) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Adding before Inst: ";
InsertionPoint->dump();
dbgs() << "the following inst: ";
@ -1790,7 +1802,7 @@ BBIterTy ShrinkWrapping::processInsertion(BBIterTy InsertionPoint,
return ++CurBB->insertInstruction(InsertionPoint, std::move(NewInst));
}
CurBB->addInstruction(std::move(NewInst));
DEBUG(dbgs() << "Adding to BB!\n");
LLVM_DEBUG(dbgs() << "Adding to BB!\n");
return CurBB->end();
}
@ -1877,7 +1889,7 @@ bool ShrinkWrapping::processInsertions() {
continue;
Changes = true;
auto List = *TodoList;
DEBUG({
LLVM_DEBUG({
dbgs() << "Now processing insertions in " << BB.getName()
<< " before inst: ";
Inst.dump();
@ -1927,7 +1939,7 @@ void ShrinkWrapping::processDeletions() {
}
}
DEBUG({
LLVM_DEBUG({
dbgs() << "Erasing: ";
BC.printInstruction(dbgs(), Inst);
});
@ -1962,8 +1974,8 @@ bool ShrinkWrapping::perform() {
DomOrder = std::vector<MCPhysReg>(BC.MRI->getNumRegs(), 0);
if (BF.checkForAmbiguousJumpTables()) {
DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: ambiguous JTs in " << BF.getPrintName()
<< ".\n");
// We could call disambiguateJumpTables here, but it is probably not worth
// the cost (of duplicating potentially large jump tables that could regress
// dcache misses). Moreover, ambiguous JTs are rare and coming from code
@ -1977,7 +1989,7 @@ bool ShrinkWrapping::perform() {
computeSaveLocations();
computeDomOrder();
moveSaveRestores();
DEBUG({
LLVM_DEBUG({
dbgs() << "Func before shrink-wrapping: \n";
BF.dump();
});
@ -1988,14 +2000,14 @@ bool ShrinkWrapping::perform() {
processDeletions();
if (foldIdenticalSplitEdges()) {
const auto Stats = BF.eraseInvalidBBs();
DEBUG(dbgs() << "Deleted " << Stats.first << " redundant split edge BBs ("
<< Stats.second << " bytes) for " << BF.getPrintName()
<< "\n");
LLVM_DEBUG(dbgs() << "Deleted " << Stats.first
<< " redundant split edge BBs (" << Stats.second
<< " bytes) for " << BF.getPrintName() << "\n");
}
rebuildCFI();
// We may have split edges, creating BBs that need correct branching
BF.fixBranches();
DEBUG({
LLVM_DEBUG({
dbgs() << "Func after shrink-wrapping: \n";
BF.dump();
});

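The recurring sign flips in this file (CfaOffset = -CFI->getOffset() in classifyCFIs, the negated Adjustment in performChanges, and mutateCFIOffsetFor(Inst, -NewOffset) in updateCFIInstOffset) track an upstream MC change: the old MCCFIInstruction::createDefCfaOffset negated its argument when storing it, while the newer cfiDefCfaOffset records the DWARF offset verbatim, so getOffset() changed sign for def_cfa/def_cfa_offset records. A sketch of the reading side under that assumption:

#include "llvm/MC/MCDwarf.h"
#include <cstdint>

// Recover the SP-relative value BOLT tracks internally from a
// def_cfa_offset record under the new convention.
int64_t spRelativeCfaOffset(const llvm::MCCFIInstruction &CFI) {
  return -CFI.getOffset(); // was plain CFI.getOffset() before the rename
}
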
View File

@ -12,7 +12,7 @@
#include "BinaryFunction.h"
#include "ParallelUtilities.h"
#include "SplitFunctions.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#include <numeric>
#include <vector>
@ -143,9 +143,10 @@ void SplitFunctions::splitFunction(BinaryFunction &BF) {
size_t ColdSize;
if (BC.isX86()) {
std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
DEBUG(dbgs() << "Estimated size for function " << BF << " pre-split is <0x"
<< Twine::utohexstr(OriginalHotSize) << ", 0x"
<< Twine::utohexstr(ColdSize) << ">\n");
LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
<< " pre-split is <0x"
<< Twine::utohexstr(OriginalHotSize) << ", 0x"
<< Twine::utohexstr(ColdSize) << ">\n");
}
if (opts::SplitFunctions == SplitFunctions::ST_LARGE && !BC.HasRelocations) {
@ -224,15 +225,15 @@ void SplitFunctions::splitFunction(BinaryFunction &BF) {
// Check the new size to see if it's worth splitting the function.
if (BC.isX86() && BF.isSplit()) {
std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
DEBUG(dbgs() << "Estimated size for function " << BF << " post-split is <0x"
<< Twine::utohexstr(HotSize) << ", 0x"
<< Twine::utohexstr(ColdSize) << ">\n");
LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
<< " post-split is <0x" << Twine::utohexstr(HotSize)
<< ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n 0x"
<< Twine::utohexstr(HotSize) << ", 0x"
<< Twine::utohexstr(ColdSize) << " -> 0x"
<< Twine::utohexstr(OriginalHotSize) << '\n');
LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n 0x"
<< Twine::utohexstr(HotSize) << ", 0x"
<< Twine::utohexstr(ColdSize) << " -> 0x"
<< Twine::utohexstr(OriginalHotSize) << '\n');
BF.updateBasicBlockLayout(PreSplitLayout);
for (auto &BB : BF) {

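The reversal check above compares sizes rounded up to SplitAlignThreshold before deciding whether splitting paid off. A worked sketch with invented numbers, assuming SplitAlignThreshold = 64 and SplitThreshold = 0:

#include <cstdint>

// Local stand-in for llvm::alignTo: round V up to a multiple of A.
static uint64_t alignUp(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

static bool shouldRevertSplit(uint64_t OriginalHotSize, uint64_t HotSize,
                              uint64_t AlignThreshold, uint64_t Threshold) {
  return alignUp(OriginalHotSize, AlignThreshold) <=
         alignUp(HotSize, AlignThreshold) + Threshold;
}
// shouldRevertSplit(130, 100, 64, 0): 192 <= 128 is false, keep the split.
// shouldRevertSplit(120, 100, 64, 0): 128 <= 128 is true, undo the split.
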
View File

@ -18,8 +18,8 @@ namespace llvm {
namespace bolt {
void StackAllocationAnalysis::preflight() {
DEBUG(dbgs() << "Starting StackAllocationAnalysis on \""
<< Func.getPrintName() << "\"\n");
LLVM_DEBUG(dbgs() << "Starting StackAllocationAnalysis on \""
<< Func.getPrintName() << "\"\n");
for (auto &BB : this->Func) {
for (auto &Inst : BB) {
@ -65,7 +65,7 @@ BitVector StackAllocationAnalysis::doKill(const MCInst &Point,
continue;
if (InstrOffset < SPOffset) {
Next.reset(I.getBitVectorIndex());
DEBUG({
LLVM_DEBUG({
dbgs() << "SAA FYI: Killed: ";
Instr->dump();
dbgs() << "by: ";

View File

@ -24,8 +24,8 @@ StackAvailableExpressions::StackAvailableExpressions(const RegAnalysis &RA,
: InstrsDataflowAnalysis(BC, BF), RA(RA), FA(FA) {}
void StackAvailableExpressions::preflight() {
DEBUG(dbgs() << "Starting StackAvailableExpressions on \""
<< Func.getPrintName() << "\"\n");
LLVM_DEBUG(dbgs() << "Starting StackAvailableExpressions on \""
<< Func.getPrintName() << "\"\n");
// Populate our universe of tracked expressions. We are interested in
// tracking available stores to frame position at any given point of the
@ -113,11 +113,11 @@ BitVector StackAvailableExpressions::computeNext(const MCInst &Point,
// Kill
for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
assert(*I != nullptr && "Lost pointers");
DEBUG(dbgs() << "\t\t\tDoes it kill ");
DEBUG((*I)->dump());
LLVM_DEBUG(dbgs() << "\t\t\tDoes it kill ");
LLVM_DEBUG((*I)->dump());
if (doesXKillsY(&Point, *I)) {
DEBUG(dbgs() << "\t\t\t\tKilling ");
DEBUG((*I)->dump());
LLVM_DEBUG(dbgs() << "\t\t\t\tKilling ");
LLVM_DEBUG((*I)->dump());
Next.reset(I.getBitVectorIndex());
}
}

View File

@ -63,8 +63,8 @@ bool StackReachingUses::isStoreUsed(const FrameIndexEntry &StoreFIE,
}
void StackReachingUses::preflight() {
DEBUG(dbgs() << "Starting StackReachingUses on \"" << Func.getPrintName()
<< "\"\n");
LLVM_DEBUG(dbgs() << "Starting StackReachingUses on \"" << Func.getPrintName()
<< "\"\n");
// Populate our universe of tracked expressions. We are interested in
// tracking reaching loads from frame position at any given point of the
@ -109,8 +109,8 @@ BitVector StackReachingUses::computeNext(const MCInst &Point,
for (auto I = expr_begin(Next), E = expr_end(); I != E; ++I) {
assert(*I != nullptr && "Lost pointers");
if (doesXKillsY(&Point, *I)) {
DEBUG(dbgs() << "\t\t\tKilling ");
DEBUG((*I)->dump());
LLVM_DEBUG(dbgs() << "\t\t\tKilling ");
LLVM_DEBUG((*I)->dump());
Next.reset(I.getBitVectorIndex());
}
};

View File

@ -1,5 +1,5 @@
#include "StokeInfo.h"
#include "llvm/Support/Options.h"
#include "llvm/Support/CommandLine.h"
#undef DEBUG_TYPE
#define DEBUG_TYPE "stoke"
@ -25,13 +25,13 @@ void getRegNameFromBitVec(const BinaryContext &BC, const BitVector &RegV,
std::set<std::string> *NameVec = nullptr) {
int RegIdx = RegV.find_first();
while (RegIdx != -1) {
DEBUG(dbgs() << BC.MRI->getName(RegIdx) << " ");
LLVM_DEBUG(dbgs() << BC.MRI->getName(RegIdx) << " ");
if (NameVec) {
NameVec->insert(std::string(BC.MRI->getName(RegIdx)));
}
RegIdx = RegV.find_next(RegIdx);
}
DEBUG(dbgs() << "\n");
LLVM_DEBUG(dbgs() << "\n");
}
void StokeInfo::checkInstr(const BinaryContext &BC, const BinaryFunction &BF,
@ -123,12 +123,12 @@ bool StokeInfo::checkFunction(const BinaryContext &BC, BinaryFunction &BF,
return false;
}
DEBUG(dbgs() << "\t [DefIn]\n\t ");
LLVM_DEBUG(dbgs() << "\t [DefIn]\n\t ");
auto LiveInBV = *(DInfo.getLivenessAnalysis().getStateAt(FirstNonPseudo));
LiveInBV &= DefaultDefInMask;
getRegNameFromBitVec(BC, LiveInBV, &FuncInfo.DefIn);
DEBUG(dbgs() << "\t [LiveOut]\n\t ");
LLVM_DEBUG(dbgs() << "\t [LiveOut]\n\t ");
auto LiveOutBV = RA.getFunctionClobberList(&BF);
LiveOutBV &= DefaultLiveOutMask;
getRegNameFromBitVec(BC, LiveOutBV, &FuncInfo.LiveOut);
@ -149,9 +149,9 @@ void StokeInfo::runOnFunctions(BinaryContext &BC) {
}
// Check some context metadata.
DEBUG(dbgs() << "\tTarget: " << BC.TheTarget->getName() << "\n");
DEBUG(dbgs() << "\tTripleName " << BC.TripleName << "\n");
DEBUG(dbgs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n");
LLVM_DEBUG(dbgs() << "\tTarget: " << BC.TheTarget->getName() << "\n");
LLVM_DEBUG(dbgs() << "\tTripleName " << BC.TripleName << "\n");
LLVM_DEBUG(dbgs() << "\tgetNumRegs " << BC.MRI->getNumRegs() << "\n");
auto CG = buildCallGraph(BC);
RegAnalysis RA(BC, &BC.getBinaryFunctions(), &CG);

View File

@ -210,7 +210,8 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const {
continue;
if (HasTailCalls) {
DEBUG(dbgs() << Function << " has tail calls and internal calls.\n");
LLVM_DEBUG(dbgs() << Function
<< " has tail calls and internal calls.\n");
return false;
}
@ -224,7 +225,7 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const {
FIE.IsStoreFromReg, Reg, SrcImm,
FIE.StackPtrReg, StackOffset, FIE.Size,
FIE.IsSimple, IsIndexed)) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Frame analysis failed - not simple: " << Function << "\n";
Function.dump();
});
@ -232,7 +233,7 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const {
}
if (!FIE.IsLoad || FIE.StackPtrReg != BC.MIB->getStackPointer() ||
StackOffset != 0) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Target instruction does not fetch return address - not "
"simple: "
<< Function << "\n";
@ -260,30 +261,30 @@ bool ValidateInternalCalls::analyzeFunction(BinaryFunction &Function) const {
std::pair<MCPhysReg, int64_t> Input1 = std::make_pair(Reg, 0);
std::pair<MCPhysReg, int64_t> Input2 = std::make_pair(0, 0);
if (!BC.MIB->evaluateSimple(Use, Output, Input1, Input2)) {
DEBUG(dbgs() << "Evaluate simple failed.\n");
LLVM_DEBUG(dbgs() << "Evaluate simple failed.\n");
return false;
}
if (Offset + Output < 0 ||
Offset + Output > static_cast<int64_t>(Function.getSize())) {
DEBUG({
LLVM_DEBUG({
dbgs() << "Detected out-of-range PIC reference in " << Function
<< "\nReturn address load: ";
BC.InstPrinter->printInst(TargetInst, dbgs(), "", *BC.STI);
BC.InstPrinter->printInst(TargetInst, 0, "", *BC.STI, dbgs());
dbgs() << "\nUse: ";
BC.InstPrinter->printInst(&Use, dbgs(), "", *BC.STI);
BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs());
dbgs() << "\n";
Function.dump();
});
return false;
}
DEBUG({
LLVM_DEBUG({
dbgs() << "Validated access: ";
BC.InstPrinter->printInst(&Use, dbgs(), "", *BC.STI);
BC.InstPrinter->printInst(&Use, 0, "", *BC.STI, dbgs());
dbgs() << "\n";
});
}
if (!UseDetected) {
DEBUG(dbgs() << "No use detected.\n");
LLVM_DEBUG(dbgs() << "No use detected.\n");
return false;
}
}
@ -321,7 +322,7 @@ void ValidateInternalCalls::runOnFunctions(BinaryContext &BC) {
// case, we mark this function as non-simple and stop processing it.
std::set<BinaryFunction *> Invalid;
for (auto *Function : NeedsValidation) {
DEBUG(dbgs() << "Validating " << *Function << "\n");
LLVM_DEBUG(dbgs() << "Validating " << *Function << "\n");
if (!analyzeFunction(*Function)) {
Invalid.insert(Function);
}

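Besides the LLVM_DEBUG migration, these hunks adapt to the new MCInstPrinter::printInst signature, which gained an instruction Address parameter and moved the output stream to the last position. A sketch of the new call shape with a hypothetical helper:

#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/raw_ostream.h"

void dumpInst(llvm::MCInstPrinter &IP, const llvm::MCInst &Inst,
              const llvm::MCSubtargetInfo &STI) {
  // Was: IP.printInst(&Inst, dbgs(), "", STI);
  IP.printInst(&Inst, /*Address=*/0, /*Annot=*/"", STI, llvm::errs());
}
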
View File

@ -63,8 +63,8 @@ void VeneerElimination::runOnFunctions(BinaryContext &BC) {
}
}
DEBUG(dbgs() << "BOLT-INFO: number of removed linker-inserted veneers :" << VeneersCount
<< "\n");
LLVM_DEBUG(dbgs() << "BOLT-INFO: number of removed linker-inserted veneers :"
<< VeneersCount << "\n");
// Handle veneers that point to other veneers, in case they occur
for (auto entry : VeneerDestinations) {
@ -97,8 +97,9 @@ void VeneerElimination::runOnFunctions(BinaryContext &BC) {
}
}
DEBUG(dbgs() << "BOLT-INFO: number of linker-inserted veneers call sites :" << VeneerCallers
<< "\n");
LLVM_DEBUG(
dbgs() << "BOLT-INFO: number of linker-inserted veneers call sites :"
<< VeneerCallers << "\n");
}
} // namespace bolt

View File

@ -1,369 +0,0 @@
//===-- ProfileReader.cpp - BOLT profile de-serializer ----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "Passes/MCF.h"
#include "ProfileReader.h"
#include "ProfileYAMLMapping.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
namespace opts {
extern cl::opt<unsigned> Verbosity;
extern cl::OptionCategory BoltOptCategory;
static llvm::cl::opt<bool>
IgnoreHash("profile-ignore-hash",
cl::desc("ignore hash while reading function profile"),
cl::init(false),
cl::ZeroOrMore,
cl::Hidden,
cl::cat(BoltOptCategory));
}
namespace llvm {
namespace bolt {
void
ProfileReader::buildNameMaps(std::map<uint64_t, BinaryFunction> &Functions) {
for (auto &YamlBF : YamlBP.Functions) {
StringRef Name = YamlBF.Name;
const auto Pos = Name.find("(*");
if (Pos != StringRef::npos)
Name = Name.substr(0, Pos);
ProfileNameToProfile[Name] = &YamlBF;
if (const auto CommonName = getLTOCommonName(Name)) {
LTOCommonNameMap[*CommonName].push_back(&YamlBF);
}
}
for (auto &BFI : Functions) {
const auto &Function = BFI.second;
for (auto Name : Function.getNames()) {
if (const auto CommonName = getLTOCommonName(Name)) {
LTOCommonNameFunctionMap[*CommonName].insert(&Function);
}
}
}
}
bool
ProfileReader::parseFunctionProfile(BinaryFunction &BF,
const yaml::bolt::BinaryFunctionProfile &YamlBF) {
auto &BC = BF.getBinaryContext();
bool ProfileMatched = true;
uint64_t MismatchedBlocks = 0;
uint64_t MismatchedCalls = 0;
uint64_t MismatchedEdges = 0;
uint64_t FunctionExecutionCount = 0;
BF.setExecutionCount(YamlBF.ExecCount);
if (!opts::IgnoreHash && YamlBF.Hash != BF.computeHash(/*UseDFS=*/true)) {
if (opts::Verbosity >= 1)
errs() << "BOLT-WARNING: function hash mismatch\n";
ProfileMatched = false;
}
if (YamlBF.NumBasicBlocks != BF.size()) {
if (opts::Verbosity >= 1)
errs() << "BOLT-WARNING: number of basic blocks mismatch\n";
ProfileMatched = false;
}
auto DFSOrder = BF.dfs();
for (const auto &YamlBB : YamlBF.Blocks) {
if (YamlBB.Index >= DFSOrder.size()) {
if (opts::Verbosity >= 2)
errs() << "BOLT-WARNING: index " << YamlBB.Index
<< " is out of bounds\n";
++MismatchedBlocks;
continue;
}
auto &BB = *DFSOrder[YamlBB.Index];
// A basic samples profile (without LBR) does not have branch information
// and needs special processing.
if (YamlBP.Header.Flags & BinaryFunction::PF_SAMPLE) {
if (!YamlBB.EventCount) {
BB.setExecutionCount(0);
continue;
}
auto NumSamples = YamlBB.EventCount * 1000;
if (NormalizeByInsnCount && BB.getNumNonPseudos()) {
NumSamples /= BB.getNumNonPseudos();
} else if (NormalizeByCalls) {
NumSamples /= BB.getNumCalls() + 1;
}
BB.setExecutionCount(NumSamples);
if (BB.isEntryPoint())
FunctionExecutionCount += NumSamples;
continue;
}
BB.setExecutionCount(YamlBB.ExecCount);
for (const auto &YamlCSI: YamlBB.CallSites) {
auto *Callee = YamlCSI.DestId < YamlProfileToFunction.size() ?
YamlProfileToFunction[YamlCSI.DestId] : nullptr;
bool IsFunction = Callee != nullptr;
MCSymbol *CalleeSymbol = nullptr;
if (IsFunction) {
CalleeSymbol = Callee->getSymbolForEntryID(YamlCSI.EntryDiscriminator);
}
BF.getAllCallSites().emplace_back(
CalleeSymbol, YamlCSI.Count, YamlCSI.Mispreds, YamlCSI.Offset);
if (YamlCSI.Offset >= BB.getOriginalSize()) {
if (opts::Verbosity >= 2)
errs() << "BOLT-WARNING: offset " << YamlCSI.Offset
<< " out of bounds in block " << BB.getName() << '\n';
++MismatchedCalls;
continue;
}
auto *Instr =
BF.getInstructionAtOffset(BB.getInputOffset() + YamlCSI.Offset);
if (!Instr) {
if (opts::Verbosity >= 2)
errs() << "BOLT-WARNING: no instruction at offset " << YamlCSI.Offset
<< " in block " << BB.getName() << '\n';
++MismatchedCalls;
continue;
}
if (!BC.MIB->isCall(*Instr) && !BC.MIB->isIndirectBranch(*Instr)) {
if (opts::Verbosity >= 2)
errs() << "BOLT-WARNING: expected call at offset " << YamlCSI.Offset
<< " in block " << BB.getName() << '\n';
++MismatchedCalls;
continue;
}
auto setAnnotation = [&](StringRef Name, uint64_t Count) {
if (BC.MIB->hasAnnotation(*Instr, Name)) {
if (opts::Verbosity >= 1)
errs() << "BOLT-WARNING: ignoring duplicate " << Name
<< " info for offset 0x" << Twine::utohexstr(YamlCSI.Offset)
<< " in function " << BF << '\n';
return;
}
BC.MIB->addAnnotation(*Instr, Name, Count);
};
if (BC.MIB->isIndirectCall(*Instr) || BC.MIB->isIndirectBranch(*Instr)) {
IndirectCallSiteProfile &CSP =
BC.MIB->getOrCreateAnnotationAs<IndirectCallSiteProfile>(
*Instr, "CallProfile");
CSP.emplace_back(CalleeSymbol, YamlCSI.Count, YamlCSI.Mispreds);
} else if (BC.MIB->getConditionalTailCall(*Instr)) {
setAnnotation("CTCTakenCount", YamlCSI.Count);
setAnnotation("CTCMispredCount", YamlCSI.Mispreds);
} else {
setAnnotation("Count", YamlCSI.Count);
}
}
for (const auto &YamlSI : YamlBB.Successors) {
if (YamlSI.Index >= DFSOrder.size()) {
if (opts::Verbosity >= 1)
errs() << "BOLT-WARNING: index out of bounds for profiled block\n";
++MismatchedEdges;
continue;
}
auto &SuccessorBB = *DFSOrder[YamlSI.Index];
if (!BB.getSuccessor(SuccessorBB.getLabel())) {
if (opts::Verbosity >= 1)
errs() << "BOLT-WARNING: no successor for block " << BB.getName()
<< " that matches index " << YamlSI.Index << " or block "
<< SuccessorBB.getName() << '\n';
++MismatchedEdges;
continue;
}
auto &BI = BB.getBranchInfo(SuccessorBB);
BI.Count += YamlSI.Count;
BI.MispredictedCount += YamlSI.Mispreds;
}
}
// If a basic block's profile wasn't read, its execution count should be 0.
for (auto &BB : BF) {
if (BB.getExecutionCount() == BinaryBasicBlock::COUNT_NO_PROFILE)
BB.setExecutionCount(0);
}
if (YamlBP.Header.Flags & BinaryFunction::PF_SAMPLE) {
BF.setExecutionCount(FunctionExecutionCount);
estimateEdgeCounts(BF);
}
ProfileMatched &= !MismatchedBlocks && !MismatchedCalls && !MismatchedEdges;
if (ProfileMatched)
BF.markProfiled(YamlBP.Header.Flags);
if (!ProfileMatched && opts::Verbosity >= 1) {
errs() << "BOLT-WARNING: " << MismatchedBlocks << " blocks, "
<< MismatchedCalls << " calls, and " << MismatchedEdges
<< " edges in profile did not match function " << BF << '\n';
}
return ProfileMatched;
}
std::error_code
ProfileReader::readProfile(const std::string &FileName,
std::map<uint64_t, BinaryFunction> &Functions) {
auto MB = MemoryBuffer::getFileOrSTDIN(FileName);
if (std::error_code EC = MB.getError()) {
errs() << "ERROR: cannot open " << FileName << ": " << EC.message() << "\n";
return EC;
}
yaml::Input YamlInput(MB.get()->getBuffer());
// Consume YAML file.
YamlInput >> YamlBP;
if (YamlInput.error()) {
errs() << "BOLT-ERROR: syntax error parsing profile in " << FileName
<< " : " << YamlInput.error().message() << '\n';
return YamlInput.error();
}
// Sanity check.
if (YamlBP.Header.Version != 1) {
errs() << "BOLT-ERROR: cannot read profile : unsupported version\n";
return std::make_error_code(std::errc::executable_format_error);
}
if (YamlBP.Header.EventNames.find(',') != StringRef::npos) {
errs() << "BOLT-ERROR: multiple events in profile are not supported\n";
return std::make_error_code(std::errc::executable_format_error);
}
NormalizeByInsnCount = usesEvent("cycles") || usesEvent("instructions");
NormalizeByCalls = usesEvent("branches");
// Match profile to function based on a function name.
buildNameMaps(Functions);
YamlProfileToFunction.resize(YamlBP.Functions.size() + 1);
auto profileMatches = [](const yaml::bolt::BinaryFunctionProfile &Profile,
BinaryFunction &BF) {
if (opts::IgnoreHash && Profile.NumBasicBlocks == BF.size())
return true;
if (!opts::IgnoreHash &&
Profile.Hash == static_cast<uint64_t>(BF.getHash()))
return true;
return false;
};
// We have to do 2 passes since LTO introduces an ambiguity in function
// names. The first pass assigns profiles that match 100% by name and
// by hash. The second pass allows name ambiguity for LTO private functions.
for (auto &BFI : Functions) {
auto &Function = BFI.second;
// Recompute hash once per function.
if (!opts::IgnoreHash)
Function.computeHash(/*UseDFS=*/true);
for (auto FunctionName : Function.getNames()) {
auto PI = ProfileNameToProfile.find(FunctionName);
if (PI == ProfileNameToProfile.end()) {
continue;
}
auto &YamlBF = *PI->getValue();
if (profileMatches(YamlBF, Function))
matchProfileToFunction(YamlBF, Function);
}
}
for (auto &BFI : Functions) {
auto &Function = BFI.second;
if (ProfiledFunctions.count(&Function))
continue;
for (auto FunctionName : Function.getNames()) {
const auto CommonName = getLTOCommonName(FunctionName);
if (CommonName) {
auto I = LTOCommonNameMap.find(*CommonName);
if (I == LTOCommonNameMap.end())
continue;
bool ProfileMatched{false};
auto &LTOProfiles = I->getValue();
for (auto *YamlBF : LTOProfiles) {
if (YamlBF->Used)
continue;
if ((ProfileMatched = profileMatches(*YamlBF, Function))) {
matchProfileToFunction(*YamlBF, Function);
break;
}
}
if (ProfileMatched)
break;
// If there's only one function with a given name, try to
// match it partially.
if (LTOProfiles.size() == 1 &&
LTOCommonNameFunctionMap[*CommonName].size() == 1 &&
!LTOProfiles.front()->Used) {
matchProfileToFunction(*LTOProfiles.front(), Function);
break;
}
} else {
auto PI = ProfileNameToProfile.find(FunctionName);
if (PI == ProfileNameToProfile.end())
continue;
auto &YamlBF = *PI->getValue();
if (!YamlBF.Used) {
matchProfileToFunction(YamlBF, Function);
break;
}
}
}
}
for (auto &YamlBF : YamlBP.Functions) {
if (!YamlBF.Used) {
errs() << "BOLT-WARNING: profile ignored for function "
<< YamlBF.Name << '\n';
}
}
for (auto &YamlBF : YamlBP.Functions) {
if (YamlBF.Id >= YamlProfileToFunction.size()) {
// Such a profile was ignored.
continue;
}
if (auto *BF = YamlProfileToFunction[YamlBF.Id]) {
parseFunctionProfile(*BF, YamlBF);
}
}
return YamlInput.error();
}
bool ProfileReader::usesEvent(StringRef Name) const {
return YamlBP.Header.EventNames.find(Name) != StringRef::npos;
}
} // end namespace bolt
} // end namespace llvm

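The two-pass matching in the deleted reader leans on getLTOCommonName, which maps LTO-private clones such as foo.lto_priv.123 onto a shared prefix so that ambiguous names can still be matched by hash in the second pass. An illustrative stand-in (the suffix list is assumed, not BOLT's exact one):

#include <cstddef>
#include <optional>
#include <string>

std::optional<std::string> commonLTOName(const std::string &Name) {
  for (const char *Suffix : {".llvm.", ".lto_priv.", ".constprop."}) {
    std::size_t Pos = Name.find(Suffix);
    if (Pos != std::string::npos)
      return Name.substr(0, Pos + std::string(Suffix).size());
  }
  return std::nullopt;
}
// commonLTOName("foo.lto_priv.123") yields "foo.lto_priv.", so every clone
// of foo lands in the same LTOCommonNameMap bucket and is told apart by hash.
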
View File

@ -1,83 +0,0 @@
//===-- ProfileReader.h - BOLT profile deserializer -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PROFILEREADER_H
#define LLVM_TOOLS_LLVM_BOLT_PROFILEREADER_H
#include "BinaryFunction.h"
#include "ProfileYAMLMapping.h"
#include <unordered_set>
namespace llvm {
namespace bolt {
class ProfileReader {
private:
/// Adjustments for basic samples profiles (without LBR).
bool NormalizeByInsnCount{false};
bool NormalizeByCalls{false};
/// Binary profile in YAML format.
yaml::bolt::BinaryProfile YamlBP;
/// Map a function ID from a YAML profile to a BinaryFunction object.
std::vector<BinaryFunction *> YamlProfileToFunction;
/// To keep track of functions that have a matched profile before the profile
/// is attributed.
std::unordered_set<const BinaryFunction *> ProfiledFunctions;
/// Populate \p Function profile with the one supplied in YAML format.
bool parseFunctionProfile(BinaryFunction &Function,
const yaml::bolt::BinaryFunctionProfile &YamlBF);
/// For LTO symbol resolution.
/// Map a common LTO prefix to a list of YAML profiles matching the prefix.
StringMap<std::vector<yaml::bolt::BinaryFunctionProfile *>> LTOCommonNameMap;
/// Map a common LTO prefix to a set of binary functions.
StringMap<std::unordered_set<const BinaryFunction *>>
LTOCommonNameFunctionMap;
/// Strict matching of a name in a profile to its contents.
StringMap<yaml::bolt::BinaryFunctionProfile *> ProfileNameToProfile;
/// Initialize maps for profile matching.
void buildNameMaps(std::map<uint64_t, BinaryFunction> &Functions);
/// Update matched YAML -> BinaryFunction pair.
void matchProfileToFunction(yaml::bolt::BinaryFunctionProfile &YamlBF,
BinaryFunction &BF) {
if (YamlBF.Id >= YamlProfileToFunction.size())
YamlProfileToFunction.resize(YamlBF.Id + 1);
YamlProfileToFunction[YamlBF.Id] = &BF;
YamlBF.Used = true;
assert(!ProfiledFunctions.count(&BF) &&
"function already has an assigned profile");
ProfiledFunctions.emplace(&BF);
}
/// Check if the profile uses an event with a given \p Name.
bool usesEvent(StringRef Name) const;
public:
/// Read profile from a file and associate with a set of functions.
std::error_code readProfile(const std::string &FileName,
std::map<uint64_t, BinaryFunction> &Functions);
};
}
}
#endif

View File

@ -1,231 +0,0 @@
//===-- ProfileWriter.cpp - Serialize profiling data ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#include "BinaryBasicBlock.h"
#include "BinaryFunction.h"
#include "DataAggregator.h"
#include "ProfileWriter.h"
#include "ProfileYAMLMapping.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt-prof"
namespace llvm {
namespace bolt {
namespace {
void
convert(const BinaryFunction &BF, yaml::bolt::BinaryFunctionProfile &YamlBF) {
auto &BC = BF.getBinaryContext();
const auto LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR;
YamlBF.Name = BF.getPrintName();
YamlBF.Id = BF.getFunctionNumber();
YamlBF.Hash = BF.computeHash(/*UseDFS=*/true);
YamlBF.NumBasicBlocks = BF.size();
YamlBF.ExecCount = BF.getKnownExecutionCount();
FuncSampleData *SampleDataOrErr{nullptr};
if (!LBRProfile) {
SampleDataOrErr = BC.DR.getFuncSampleData(BF.getNames());
if (!SampleDataOrErr)
return;
}
for (const auto *BB : BF.dfs()) {
yaml::bolt::BinaryBasicBlockProfile YamlBB;
YamlBB.Index = BB->getLayoutIndex();
YamlBB.NumInstructions = BB->getNumNonPseudos();
if (!LBRProfile) {
YamlBB.EventCount =
SampleDataOrErr->getSamples(BB->getInputOffset(), BB->getEndOffset());
if (YamlBB.EventCount)
YamlBF.Blocks.emplace_back(YamlBB);
continue;
}
YamlBB.ExecCount = BB->getKnownExecutionCount();
for (const auto &Instr : *BB) {
if (!BC.MIB->isCall(Instr) && !BC.MIB->isIndirectBranch(Instr))
continue;
yaml::bolt::CallSiteInfo CSI;
auto Offset = BC.MIB->tryGetAnnotationAs<uint32_t>(Instr, "Offset");
if (!Offset || Offset.get() < BB->getInputOffset())
continue;
CSI.Offset = Offset.get() - BB->getInputOffset();
if (BC.MIB->isIndirectCall(Instr) || BC.MIB->isIndirectBranch(Instr)) {
auto ICSP =
BC.MIB->tryGetAnnotationAs<IndirectCallSiteProfile>(Instr,
"CallProfile");
if (!ICSP)
continue;
for (auto &CSP : ICSP.get()) {
CSI.DestId = 0; // designated for unknown functions
CSI.EntryDiscriminator = 0;
if (CSP.Symbol) {
const auto *Callee = BC.getFunctionForSymbol(CSP.Symbol);
if (Callee) {
CSI.DestId = Callee->getFunctionNumber();
}
}
CSI.Count = CSP.Count;
CSI.Mispreds = CSP.Mispreds;
YamlBB.CallSites.push_back(CSI);
}
} else { // direct call or a tail call
uint64_t EntryID{0};
const auto *CalleeSymbol = BC.MIB->getTargetSymbol(Instr);
const auto Callee = BC.getFunctionForSymbol(CalleeSymbol, &EntryID);
if (Callee) {
CSI.DestId = Callee->getFunctionNumber();
CSI.EntryDiscriminator = EntryID;
}
if (BC.MIB->getConditionalTailCall(Instr)) {
auto CTCCount =
BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "CTCTakenCount");
if (CTCCount) {
CSI.Count = *CTCCount;
auto CTCMispreds =
BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "CTCMispredCount");
if (CTCMispreds)
CSI.Mispreds = *CTCMispreds;
}
} else {
auto Count = BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "Count");
if (Count)
CSI.Count = *Count;
}
if (CSI.Count)
YamlBB.CallSites.emplace_back(CSI);
}
}
std::sort(YamlBB.CallSites.begin(), YamlBB.CallSites.end());
// Skip printing if there's no profile data for a non-entry basic block.
// Include landing pads with a non-zero execution count.
if (YamlBB.CallSites.empty() &&
!BB->isEntryPoint() &&
!(BB->isLandingPad() && BB->getKnownExecutionCount() != 0)) {
uint64_t SuccessorExecCount = 0;
for (auto &BranchInfo : BB->branch_info()) {
SuccessorExecCount += BranchInfo.Count;
}
if (!SuccessorExecCount)
continue;
}
auto BranchInfo = BB->branch_info_begin();
for (const auto *Successor : BB->successors()) {
yaml::bolt::SuccessorInfo YamlSI;
YamlSI.Index = Successor->getLayoutIndex();
YamlSI.Count = BranchInfo->Count;
YamlSI.Mispreds = BranchInfo->MispredictedCount;
YamlBB.Successors.emplace_back(YamlSI);
++BranchInfo;
}
YamlBF.Blocks.emplace_back(YamlBB);
}
}
} // end anonymous namespace
std::error_code
ProfileWriter::writeProfile(const RewriteInstance &RI) {
const auto &Functions = RI.getBinaryContext().getBinaryFunctions();
std::error_code EC;
OS = llvm::make_unique<raw_fd_ostream>(FileName, EC, sys::fs::F_None);
if (EC) {
errs() << "BOLT-WARNING: " << EC.message() << " : unable to open "
<< FileName << " for output.\n";
return EC;
}
yaml::bolt::BinaryProfile BP;
// Fill out the header info.
BP.Header.Version = 1;
auto FileName = RI.getInputFileName();
BP.Header.FileName = FileName ? *FileName : "<unknown>";
auto BuildID = RI.getPrintableBuildID();
BP.Header.Id = BuildID ? *BuildID : "<unknown>";
if (RI.getDataAggregator().started()) {
BP.Header.Origin = "aggregator";
} else {
BP.Header.Origin = "conversion";
}
auto EventNames = RI.getDataAggregator().getEventNames();
if (EventNames.empty())
EventNames = RI.getBinaryContext().DR.getEventNames();
if (!EventNames.empty()) {
std::string Sep = "";
for (const auto &EventEntry : EventNames) {
BP.Header.EventNames += Sep + EventEntry.first().str();
Sep = ",";
}
}
// Make sure the profile is consistent across all functions.
uint16_t ProfileFlags = BinaryFunction::PF_NONE;
for (const auto &BFI : Functions) {
const auto &BF = BFI.second;
if (BF.hasProfile() && !BF.empty()) {
assert(BF.getProfileFlags() != BinaryFunction::PF_NONE);
if (ProfileFlags == BinaryFunction::PF_NONE) {
ProfileFlags = BF.getProfileFlags();
}
assert(BF.getProfileFlags() == ProfileFlags &&
"expected consistent profile flags across all functions");
}
}
BP.Header.Flags = ProfileFlags;
// Add all function objects.
for (const auto &BFI : Functions) {
const auto &BF = BFI.second;
if (BF.hasProfile()) {
// In conversion mode ignore stale functions.
if (!BF.hasValidProfile() && !RI.getDataAggregator().started())
continue;
yaml::bolt::BinaryFunctionProfile YamlBF;
convert(BF, YamlBF);
BP.Functions.emplace_back(YamlBF);
}
}
// Write the profile.
yaml::Output Out(*OS, nullptr, 0);
Out << BP;
return std::error_code();
}
} // namespace bolt
} // namespace llvm

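The deleted writer serializes through LLVM's YAML I/O machinery (yaml::Output plus the MappingTraits specializations supplied by ProfileYAMLMapping.h) rather than printing by hand. A self-contained sketch of that mechanism using a hypothetical two-field header, not BOLT's real schema:

#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <string>

struct HeaderSketch {
  uint32_t Version = 1;
  std::string FileName = "<unknown>";
};

namespace llvm {
namespace yaml {
template <> struct MappingTraits<HeaderSketch> {
  static void mapping(IO &YamlIO, HeaderSketch &H) {
    YamlIO.mapRequired("version", H.Version);
    YamlIO.mapRequired("file-name", H.FileName);
  }
};
} // namespace yaml
} // namespace llvm

void writeHeader(llvm::raw_ostream &OS, HeaderSketch &H) {
  llvm::yaml::Output Out(OS); // same machinery as `Out << BP;` above
  Out << H;
}
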
View File

@ -1,48 +0,0 @@
//===-- ProfileWriter.h - serialize profiling data --------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_BOLT_PROFILE_WRITER_H
#define LLVM_TOOLS_LLVM_BOLT_PROFILE_WRITER_H
#include "BinaryBasicBlock.h"
#include "BinaryContext.h"
#include "BinaryFunction.h"
#include "ProfileYAMLMapping.h"
#include "RewriteInstance.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>
namespace llvm {
namespace bolt {
class ProfileWriter {
ProfileWriter() = delete;
std::string FileName;
std::unique_ptr<raw_fd_ostream> OS;
public:
explicit ProfileWriter(const std::string &FileName)
: FileName(FileName) {
}
/// Save execution profile for that instance.
std::error_code writeProfile(const RewriteInstance &RI);
};
} // namespace bolt
} // namespace llvm
#endif // LLVM_TOOLS_LLVM_BOLT_PROFILE_WRITER_H

View File

@ -378,8 +378,8 @@ size_t Relocation::emit(MCStreamer *Streamer) const {
const auto Size = getSizeForType(Type);
auto &Ctx = Streamer->getContext();
if (isPCRelative(Type)) {
auto *TempLabel = Ctx.createTempSymbol();
Streamer->EmitLabel(TempLabel);
auto *TempLabel = Ctx.createNamedTempSymbol();
Streamer->emitLabel(TempLabel);
const MCExpr *Value{nullptr};
if (Symbol) {
Value = MCSymbolRefExpr::create(Symbol, Ctx);
@ -394,7 +394,7 @@ size_t Relocation::emit(MCStreamer *Streamer) const {
Value = MCBinaryExpr::createSub(Value,
MCSymbolRefExpr::create(TempLabel, Ctx),
Ctx);
Streamer->EmitValue(Value, Size);
Streamer->emitValue(Value, Size);
return Size;
}
@ -403,11 +403,11 @@ size_t Relocation::emit(MCStreamer *Streamer) const {
auto Value = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(Symbol, Ctx),
MCConstantExpr::create(Addend, Ctx),
Ctx);
Streamer->EmitValue(Value, Size);
Streamer->emitValue(Value, Size);
} else if (Symbol) {
Streamer->EmitSymbolValue(Symbol, Size);
Streamer->emitSymbolValue(Symbol, Size);
} else {
Streamer->EmitIntValue(Addend, Size);
Streamer->emitIntValue(Addend, Size);
}
return Size;

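Relocation emission picks up the MCStreamer rename from Emit* to emit* and the createNamedTempSymbol change noted earlier. The PC-relative path above reduces to roughly this shape (a sketch; Size is in bytes):

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"

void emitPCRelValue(llvm::MCStreamer &Streamer, const llvm::MCSymbol *Target,
                    unsigned Size) {
  llvm::MCContext &Ctx = Streamer.getContext();
  llvm::MCSymbol *Here = Ctx.createNamedTempSymbol();
  Streamer.emitLabel(Here); // was EmitLabel
  const llvm::MCExpr *Value = llvm::MCBinaryExpr::createSub(
      llvm::MCSymbolRefExpr::create(Target, Ctx),
      llvm::MCSymbolRefExpr::create(Here, Ctx), Ctx);
  Streamer.emitValue(Value, Size); // was EmitValue
}
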
File diff suppressed because it is too large

View File

@ -20,12 +20,12 @@
#include "Passes/Instrumentation.h"
#include "ProfileReaderBase.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/StringPool.h"
#include <map>
#include <set>
@ -184,10 +184,10 @@ private:
std::vector<BinarySection *> getCodeSections();
/// Map all sections to their final addresses.
void mapCodeSections(orc::VModuleKey ObjectsHandle);
void mapDataSections(orc::VModuleKey ObjectsHandle);
void mapFileSections(orc::VModuleKey ObjectsHandle);
void mapExtraSections(orc::VModuleKey ObjectsHandle);
void mapCodeSections(RuntimeDyld &RTDyld);
void mapDataSections(RuntimeDyld &RTDyld);
void mapFileSections(RuntimeDyld &RTDyld);
void mapExtraSections(RuntimeDyld &RTDyld);
/// Update output object's values based on the final \p Layout.
void updateOutputValues(const MCAsmLayout &Layout);
@ -278,7 +278,7 @@ private:
/// Return a name of the input file section in the output file.
template<typename ELFObjType, typename ELFShdrTy>
std::string getOutputSectionName(const ELFObjType *Obj,
std::string getOutputSectionName(const ELFObjType &Obj,
const ELFShdrTy &Section);
/// Return a list of all sections to include in the output binary.
@ -413,11 +413,8 @@ private:
std::unique_ptr<BinaryContext> BC;
std::unique_ptr<CFIReaderWriter> CFIRdWrt;
std::unique_ptr<orc::SymbolStringPool> SSP;
std::unique_ptr<orc::ExecutionSession> ES;
// Run ObjectLinkingLayer() with custom memory manager and symbol resolver.
std::unique_ptr<orc::RTDyldObjectLinkingLayer> OLT;
// Run ExecutionEngine linker with custom memory manager and symbol resolver.
std::unique_ptr<RuntimeDyld> RTDyld;
/// Output file where we mix original code from the input binary and
/// optimized code for selected functions.
@ -498,8 +495,7 @@ private:
/// Section header string table.
StringTableBuilder SHStrTab;
StringPool SHStrTabPool;
std::vector<PooledStringPtr> AllSHStrTabStrings;
std::vector<std::string> SHStrTabPool;
/// A rewrite of strtab
std::string NewStrTab;

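RewriteInstance drops the ORC ExecutionSession/RTDyldObjectLinkingLayer pair for a plain RuntimeDyld. One visible consequence in the runtime-library hunks below is symbol lookup: RuntimeDyld::getSymbol returns a JITEvaluatedSymbol whose address is simply 0 when the name is undefined, so the cantFail(OLT.findSymbol(...)) dance goes away. Sketch:

#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include <cstdint>

uint64_t lookupRuntimeSymbol(llvm::RuntimeDyld &RTDyld, llvm::StringRef Name) {
  // Was: cantFail(OLT.findSymbol(Name, false).getAddress());
  return RTDyld.getSymbol(Name).getAddress(); // 0 if Name is undefined
}
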
View File

@ -7,4 +7,4 @@ add_llvm_library(LLVMBOLTRuntimeLibs
intrinsics_gen
)
include_directories( ${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src )
include_directories( ${BOLT_SOURCE_DIR}/src )

View File

@ -10,6 +10,8 @@
#include "HugifyRuntimeLibrary.h"
#include "BinaryFunction.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
using namespace bolt;
@ -73,26 +75,31 @@ void HugifyRuntimeLibrary::emitBinary(BinaryContext &BC, MCStreamer &Streamer) {
// jump to after finishing the init code.
MCSymbol *InitPtr = BC.Ctx->getOrCreateSymbol("__bolt_hugify_init_ptr");
Section->setAlignment(BC.RegularPageSize);
Section->setAlignment(llvm::Align(BC.RegularPageSize));
Streamer.SwitchSection(Section);
Streamer.EmitLabel(InitPtr);
Streamer.EmitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global);
Streamer.EmitValue(
Streamer.emitLabel(InitPtr);
Streamer.emitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(StartFunction->getSymbol(), *(BC.Ctx)),
/*Size=*/8);
}
void HugifyRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT) {
RuntimeDyld &RTDyld,
std::function<void(RuntimeDyld &)> OnLoad) {
auto LibPath = getLibPath(ToolPath, opts::RuntimeHugifyLib);
loadLibraryToOLT(LibPath, ES, OLT);
loadLibrary(LibPath, RTDyld);
OnLoad(RTDyld);
RTDyld.finalizeWithMemoryManagerLocking();
if (RTDyld.hasError()) {
outs() << "BOLT-ERROR: RTDyld failed: " << RTDyld.getErrorString() << "\n";
exit(1);
}
assert(!RuntimeStartAddress &&
"We don't currently support linking multiple runtime libraries");
RuntimeStartAddress =
cantFail(OLT.findSymbol("__bolt_hugify_self", false).getAddress());
RuntimeStartAddress = RTDyld.getSymbol("__bolt_hugify_self").getAddress();
if (!RuntimeStartAddress) {
errs() << "BOLT-ERROR: instrumentation library does not define "
"__bolt_hugify_self: "

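Both runtime libraries (here and in the InstrumentationRuntimeLibrary hunks below) also pick up the llvm::Align type: MCSection::setAlignment now takes an Align instead of a raw integer. Sketch:

#include "llvm/MC/MCSection.h"
#include "llvm/Support/Alignment.h"
#include <cstdint>

void alignSectionToPage(llvm::MCSection *Sec, uint64_t PageSize) {
  // Align(PageSize) asserts PageSize is a power of two at construction.
  Sec->setAlignment(llvm::Align(PageSize));
}
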
View File

@ -19,17 +19,16 @@ class HugifyRuntimeLibrary : public RuntimeLibrary {
public:
/// Add custom section names generated by the runtime libraries to \p
/// SecNames.
void
addRuntimeLibSections(std::vector<std::string> &SecNames) const override {
void addRuntimeLibSections(std::vector<std::string> &SecNames) const final {
SecNames.push_back(".bolt.hugify.entries");
}
void adjustCommandLineOptions(const BinaryContext &BC) const override;
void adjustCommandLineOptions(const BinaryContext &BC) const final;
void emitBinary(BinaryContext &BC, MCStreamer &Streamer) override;
void emitBinary(BinaryContext &BC, MCStreamer &Streamer) final;
void link(BinaryContext &BC, StringRef ToolPath, orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT) override;
void link(BinaryContext &BC, StringRef ToolPath, RuntimeDyld &RTDyld,
std::function<void(RuntimeDyld &)> OnLoad) final;
};
} // namespace bolt

View File

@ -12,6 +12,8 @@
#include "InstrumentationRuntimeLibrary.h"
#include "BinaryFunction.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
using namespace bolt;
@ -111,12 +113,12 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
MCSymbol *FiniPtr = BC.Ctx->getOrCreateSymbol("__bolt_instr_fini_ptr");
MCSymbol *SleepSym = BC.Ctx->getOrCreateSymbol("__bolt_instr_sleep_time");
Section->setAlignment(BC.RegularPageSize);
Section->setAlignment(llvm::Align(BC.RegularPageSize));
Streamer.SwitchSection(Section);
Streamer.EmitLabel(Locs);
Streamer.EmitSymbolAttribute(Locs, MCSymbolAttr::MCSA_Global);
Streamer.emitLabel(Locs);
Streamer.emitSymbolAttribute(Locs, MCSymbolAttr::MCSA_Global);
for (const auto &Label : Summary->Counters) {
Streamer.EmitLabel(Label);
Streamer.emitLabel(Label);
Streamer.emitFill(8, 0);
}
const uint64_t Padding =
@ -124,50 +126,50 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
8 * Summary->Counters.size();
if (Padding)
Streamer.emitFill(Padding, 0);
Streamer.EmitLabel(SleepSym);
Streamer.EmitSymbolAttribute(SleepSym, MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(opts::InstrumentationSleepTime, /*Size=*/4);
Streamer.EmitLabel(NumLocs);
Streamer.EmitSymbolAttribute(NumLocs, MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(Summary->Counters.size(), /*Size=*/4);
Streamer.EmitLabel(Summary->IndCallHandlerFunc);
Streamer.EmitSymbolAttribute(Summary->IndCallHandlerFunc,
Streamer.emitLabel(SleepSym);
Streamer.emitSymbolAttribute(SleepSym, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(opts::InstrumentationSleepTime, /*Size=*/4);
Streamer.emitLabel(NumLocs);
Streamer.emitSymbolAttribute(NumLocs, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->Counters.size(), /*Size=*/4);
Streamer.emitLabel(Summary->IndCallHandlerFunc);
Streamer.emitSymbolAttribute(Summary->IndCallHandlerFunc,
MCSymbolAttr::MCSA_Global);
Streamer.EmitValue(
Streamer.emitValue(
MCSymbolRefExpr::create(
Summary->InitialIndCallHandlerFunction->getSymbol(), *BC.Ctx),
/*Size=*/8);
Streamer.EmitLabel(Summary->IndTailCallHandlerFunc);
Streamer.EmitSymbolAttribute(Summary->IndTailCallHandlerFunc,
Streamer.emitLabel(Summary->IndTailCallHandlerFunc);
Streamer.emitSymbolAttribute(Summary->IndTailCallHandlerFunc,
MCSymbolAttr::MCSA_Global);
Streamer.EmitValue(
Streamer.emitValue(
MCSymbolRefExpr::create(
Summary->InitialIndTailCallHandlerFunction->getSymbol(), *BC.Ctx),
/*Size=*/8);
Streamer.EmitLabel(NumIndCalls);
Streamer.EmitSymbolAttribute(NumIndCalls, MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(Summary->IndCallDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(NumIndCallTargets);
Streamer.EmitSymbolAttribute(NumIndCallTargets, MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(Summary->IndCallTargetDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(NumFuncs);
Streamer.EmitSymbolAttribute(NumFuncs, MCSymbolAttr::MCSA_Global);
Streamer.emitLabel(NumIndCalls);
Streamer.emitSymbolAttribute(NumIndCalls, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->IndCallDescriptions.size(), /*Size=*/4);
Streamer.emitLabel(NumIndCallTargets);
Streamer.emitSymbolAttribute(NumIndCallTargets, MCSymbolAttr::MCSA_Global);
Streamer.emitIntValue(Summary->IndCallTargetDescriptions.size(), /*Size=*/4);
Streamer.emitLabel(NumFuncs);
Streamer.emitSymbolAttribute(NumFuncs, MCSymbolAttr::MCSA_Global);
Streamer.EmitIntValue(Summary->FunctionDescriptions.size(), /*Size=*/4);
Streamer.EmitLabel(FilenameSym);
Streamer.EmitBytes(opts::InstrumentationFilename);
Streamer.emitIntValue(Summary->FunctionDescriptions.size(), /*Size=*/4);
Streamer.emitLabel(FilenameSym);
Streamer.emitBytes(opts::InstrumentationFilename);
Streamer.emitFill(1, 0);
Streamer.EmitLabel(UsePIDSym);
Streamer.EmitIntValue(opts::InstrumentationFileAppendPID ? 1 : 0, /*Size=*/1);
Streamer.emitLabel(UsePIDSym);
Streamer.emitIntValue(opts::InstrumentationFileAppendPID ? 1 : 0, /*Size=*/1);
Streamer.EmitLabel(InitPtr);
Streamer.EmitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global);
Streamer.EmitValue(
Streamer.emitLabel(InitPtr);
Streamer.emitSymbolAttribute(InitPtr, MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(StartFunction->getSymbol(), *BC.Ctx), /*Size=*/8);
if (FiniFunction) {
Streamer.EmitLabel(FiniPtr);
Streamer.EmitSymbolAttribute(FiniPtr, MCSymbolAttr::MCSA_Global);
Streamer.EmitValue(
Streamer.emitLabel(FiniPtr);
Streamer.emitSymbolAttribute(FiniPtr, MCSymbolAttr::MCSA_Global);
Streamer.emitValue(
MCSymbolRefExpr::create(FiniFunction->getSymbol(), *BC.Ctx), /*Size=*/8);
}
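Both setAlignment call sites in this file change shape because upstream replaced the raw unsigned alignment with the llvm::Align value type, which carries the power-of-two invariant. A minimal sketch of the new form (alignToPage is a hypothetical helper, not BOLT code):

#include "llvm/MC/MCSection.h"
#include "llvm/Support/Alignment.h"

// llvm::Align asserts its argument is a nonzero power of two, so the
// page-size requirement is checked where the alignment is constructed.
void alignToPage(llvm::MCSection &Sec, uint64_t PageSize) {
  Sec.setAlignment(llvm::Align(PageSize));
}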
@ -176,33 +178,37 @@ void InstrumentationRuntimeLibrary::emitBinary(BinaryContext &BC,
"__BOLT", "__tables", MachO::S_REGULAR,
SectionKind::getData());
MCSymbol *Tables = BC.Ctx->getOrCreateSymbol("__bolt_instr_tables");
TablesSection->setAlignment(BC.RegularPageSize);
TablesSection->setAlignment(llvm::Align(BC.RegularPageSize));
Streamer.SwitchSection(TablesSection);
Streamer.EmitLabel(Tables);
Streamer.EmitSymbolAttribute(Tables, MCSymbolAttr::MCSA_Global);
Streamer.EmitBytes(buildTables(BC));
Streamer.emitLabel(Tables);
Streamer.emitSymbolAttribute(Tables, MCSymbolAttr::MCSA_Global);
Streamer.emitBytes(buildTables(BC));
}
}
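Most of the churn in emitBinary is mechanical: MCStreamer entry points were renamed from EmitX to emitX upstream, with no behavioral change. The per-symbol pattern used above, in the current spelling (emitGlobalWord is a hypothetical helper):

#include "llvm/MC/MCStreamer.h"

void emitGlobalWord(llvm::MCStreamer &OS, llvm::MCSymbol *Sym, uint64_t Val) {
  OS.emitLabel(Sym);                              // define the symbol
  OS.emitSymbolAttribute(Sym, llvm::MCSA_Global); // export it
  OS.emitIntValue(Val, /*Size=*/8);               // 8-byte payload
}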
void InstrumentationRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT) {
void InstrumentationRuntimeLibrary::link(
BinaryContext &BC, StringRef ToolPath, RuntimeDyld &RTDyld,
std::function<void(RuntimeDyld &)> OnLoad) {
auto LibPath = getLibPath(ToolPath, opts::RuntimeInstrumentationLib);
loadLibraryToOLT(LibPath, ES, OLT);
loadLibrary(LibPath, RTDyld);
OnLoad(RTDyld);
RTDyld.finalizeWithMemoryManagerLocking();
if (RTDyld.hasError()) {
outs() << "BOLT-ERROR: RTDyld failed: " << RTDyld.getErrorString() << "\n";
exit(1);
}
if (BC.isMachO())
return;
RuntimeFiniAddress =
cantFail(OLT.findSymbol("__bolt_instr_fini", false).getAddress());
RuntimeFiniAddress = RTDyld.getSymbol("__bolt_instr_fini").getAddress();
if (!RuntimeFiniAddress) {
errs() << "BOLT-ERROR: instrumentation library does not define "
"__bolt_instr_fini: "
<< LibPath << "\n";
exit(1);
}
RuntimeStartAddress =
cantFail(OLT.findSymbol("__bolt_instr_start", false).getAddress());
RuntimeStartAddress = RTDyld.getSymbol("__bolt_instr_start").getAddress();
if (!RuntimeStartAddress) {
errs() << "BOLT-ERROR: instrumentation library does not define "
"__bolt_instr_start: "
@ -212,11 +218,10 @@ void InstrumentationRuntimeLibrary::link(BinaryContext &BC, StringRef ToolPath,
outs() << "BOLT-INFO: output linked against instrumentation runtime "
"library, lib entry point is 0x"
<< Twine::utohexstr(RuntimeFiniAddress) << "\n";
outs()
<< "BOLT-INFO: clear procedure is 0x"
<< Twine::utohexstr(cantFail(
OLT.findSymbol("__bolt_instr_clear_counters", false).getAddress()))
<< "\n";
outs() << "BOLT-INFO: clear procedure is 0x"
<< Twine::utohexstr(
RTDyld.getSymbol("__bolt_instr_clear_counters").getAddress())
<< "\n";
emitTablesAsELFNote(BC);
}
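The rewritten link() drops the ORC ExecutionSession/RTDyldObjectLinkingLayer pair in favor of a plain RuntimeDyld, so symbol lookup no longer goes through cantFail(findSymbol(...)): getSymbol() returns a resolved address directly, with 0 signalling an undefined symbol. A sketch of the lookup pattern (resolveRuntimeSymbol is hypothetical):

#include "llvm/ADT/StringRef.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Support/raw_ostream.h"

// Returns 0 when the runtime library does not define Name, matching the
// error checks in link() above.
uint64_t resolveRuntimeSymbol(llvm::RuntimeDyld &RTDyld, llvm::StringRef Name) {
  uint64_t Addr = RTDyld.getSymbol(Name).getAddress();
  if (!Addr)
    llvm::errs() << "BOLT-ERROR: undefined runtime symbol: " << Name << "\n";
  return Addr;
}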


@ -21,17 +21,16 @@ public:
InstrumentationRuntimeLibrary(std::unique_ptr<InstrumentationSummary> Summary)
: Summary(std::move(Summary)) {}
void
addRuntimeLibSections(std::vector<std::string> &SecNames) const override {
void addRuntimeLibSections(std::vector<std::string> &SecNames) const final {
SecNames.push_back(".bolt.instr.counters");
}
void adjustCommandLineOptions(const BinaryContext &BC) const override;
void adjustCommandLineOptions(const BinaryContext &BC) const final;
void emitBinary(BinaryContext &BC, MCStreamer &Streamer) override;
void emitBinary(BinaryContext &BC, MCStreamer &Streamer) final;
void link(BinaryContext &BC, StringRef ToolPath, orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT) override;
void link(BinaryContext &BC, StringRef ToolPath, RuntimeDyld &RTDyld,
std::function<void(RuntimeDyld &)> OnLoad) final;
private:
std::string buildTables(BinaryContext &BC);


@ -11,7 +11,8 @@
#include "RuntimeLibrary.h"
#include "Utils.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Object/Archive.h"
#include "llvm/Support/Path.h"
@ -40,13 +41,10 @@ std::string RuntimeLibrary::getLibPath(StringRef ToolPath,
errs() << "BOLT-ERROR: library not found: " << LibPath << "\n";
exit(1);
}
return LibPath.str();
return std::string(LibPath.str());
}
void RuntimeLibrary::loadLibraryToOLT(StringRef LibPath,
orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT) {
OLT.setProcessAllSections(false);
void RuntimeLibrary::loadLibrary(StringRef LibPath, RuntimeDyld &RTDyld) {
ErrorOr<std::unique_ptr<MemoryBuffer>> MaybeBuf =
MemoryBuffer::getFile(LibPath, -1, false);
check_error(MaybeBuf.getError(), LibPath);
@ -57,18 +55,18 @@ void RuntimeLibrary::loadLibraryToOLT(StringRef LibPath,
Error Err = Error::success();
object::Archive Archive(B.get()->getMemBufferRef(), Err);
for (auto &C : Archive.children(Err)) {
auto ChildKey = ES.allocateVModule();
auto ChildBuf =
MemoryBuffer::getMemBuffer(cantFail(C.getMemoryBufferRef()));
cantFail(OLT.addObject(ChildKey, std::move(ChildBuf)));
cantFail(OLT.emitAndFinalize(ChildKey));
std::unique_ptr<object::Binary> Bin = cantFail(C.getAsBinary());
if (auto *Obj = dyn_cast<object::ObjectFile>(&*Bin)) {
RTDyld.loadObject(*Obj);
}
}
check_error(std::move(Err), B->getBufferIdentifier());
} else if (Magic == file_magic::elf_relocatable ||
Magic == file_magic::elf_shared_object) {
auto K2 = ES.allocateVModule();
cantFail(OLT.addObject(K2, std::move(B)));
cantFail(OLT.emitAndFinalize(K2));
std::unique_ptr<object::ObjectFile> Obj = cantFail(
object::ObjectFile::createObjectFile(B.get()->getMemBufferRef()),
"error creating in-memory object");
RTDyld.loadObject(*Obj);
} else {
errs() << "BOLT-ERROR: unrecognized library format: " << LibPath << "\n";
exit(1);
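loadLibraryToOLT becomes loadLibrary: archive members are now materialized as object::Binary instances and handed straight to RuntimeDyld instead of being queued on the ORC layer. A self-contained sketch of the archive path (loadArchiveMembers is hypothetical; error handling is reduced to cantFail):

#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Object/Archive.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include <memory>

void loadArchiveMembers(llvm::MemoryBufferRef Buf, llvm::RuntimeDyld &RTDyld) {
  llvm::Error Err = llvm::Error::success();
  llvm::object::Archive Archive(Buf, Err); // parse the static archive
  for (const llvm::object::Archive::Child &C : Archive.children(Err)) {
    std::unique_ptr<llvm::object::Binary> Bin = llvm::cantFail(C.getAsBinary());
    if (auto *Obj = llvm::dyn_cast<llvm::object::ObjectFile>(Bin.get()))
      RTDyld.loadObject(*Obj); // map sections, record relocations
  }
  llvm::cantFail(std::move(Err)); // surface any iteration error
}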


@ -16,15 +16,13 @@
#define LLVM_TOOLS_LLVM_BOLT_LINKRUNTIME_H
#include <llvm/ADT/StringRef.h>
#include <functional>
#include <vector>
namespace llvm {
class MCStreamer;
namespace orc {
class ExecutionSession;
class RTDyldObjectLinkingLayer;
} // namespace orc
class MCStreamer;
class RuntimeDyld;
namespace bolt {
@ -53,9 +51,8 @@ public:
virtual void emitBinary(BinaryContext &BC, MCStreamer &Streamer) = 0;
/// Link with the library code.
virtual void link(BinaryContext &BC, StringRef ToolPath,
orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT) = 0;
virtual void link(BinaryContext &BC, StringRef ToolPath, RuntimeDyld &RTDyld,
std::function<void(RuntimeDyld &)> OnLoad) = 0;
protected:
/// The fini and init address set by the runtime library.
@ -65,9 +62,8 @@ protected:
/// Get the full path to a runtime library specified by \p LibFileName.
static std::string getLibPath(StringRef ToolPath, StringRef LibFileName);
/// Load a static runtime library specified by \p LibPath to OLT.
static void loadLibraryToOLT(StringRef LibPath, orc::ExecutionSession &ES,
orc::RTDyldObjectLinkingLayer &OLT);
/// Load a static runtime library specified by \p LibPath.
static void loadLibrary(StringRef LibPath, RuntimeDyld &RTDyld);
};
} // namespace bolt
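One consequence of the new signature: the OnLoad callback runs after the objects are loaded but before finalizeWithMemoryManagerLocking(), which is the last point at which section addresses can still be remapped. A hypothetical call site (RuntimeLib, BC, ToolPath, and RTDyld are assumed to exist in the caller):

RuntimeLib->link(BC, ToolPath, RTDyld, [&](llvm::RuntimeDyld &D) {
  // Remap runtime-library sections to their output addresses here, e.g.:
  // D.reassignSectionAddress(SectionID, OutputAddress);
});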


@ -24,7 +24,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "InstPrinter/AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64InstPrinter.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "MCTargetDesc/AArch64ELFStreamer.h"
@ -316,8 +316,7 @@ public:
}
assert(OI != Inst.end() && "Literal operand not found");
}
OI = Inst.erase(OI);
Inst.insert(OI, Operand);
*OI = Operand;
return true;
}
@ -432,9 +431,8 @@ public:
OI = Inst.begin() + 2;
}
OI = Inst.erase(OI);
Inst.insert(OI, MCOperand::createExpr(MCSymbolRefExpr::create(
TBB, MCSymbolRefExpr::VK_None, *Ctx)));
*OI = MCOperand::createExpr(
MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx));
return true;
}
@ -582,7 +580,7 @@ public:
if (!Instr.getOperand(OpNum).isReg())
continue;
Uses[&Instr].push_back(RegAliasTable[Instr.getOperand(OpNum).getReg()]);
DEBUG({
LLVM_DEBUG({
dbgs() << "Adding reg operand " << Instr.getOperand(OpNum).getReg()
<< " refs ";
if (RegAliasTable[Instr.getOperand(OpNum).getReg()] != nullptr)
@ -593,7 +591,7 @@ public:
}
};
DEBUG(dbgs() << "computeLocalUDChain\n");
LLVM_DEBUG(dbgs() << "computeLocalUDChain\n");
bool TerminatorSeen = false;
for (auto II = Begin; II != End; ++II) {
auto &Instr = *II;
@ -605,8 +603,8 @@ public:
Uses.clear();
}
DEBUG(dbgs() << "Now updating for:\n ");
DEBUG(Instr.dump());
LLVM_DEBUG(dbgs() << "Now updating for:\n ");
LLVM_DEBUG(Instr.dump());
addInstrOperands(Instr);
BitVector Regs = BitVector(RegInfo->getNumRegs(), false);
@ -616,7 +614,8 @@ public:
int Idx = Regs.find_first();
while (Idx != -1) {
RegAliasTable[Idx] = &Instr;
DEBUG(dbgs() << "Setting reg " << Idx << " def to current instr.\n");
LLVM_DEBUG(dbgs() << "Setting reg " << Idx
<< " def to current instr.\n");
Idx = Regs.find_next(Idx);
}
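The DEBUG(...) to LLVM_DEBUG(...) changes in this file track an upstream macro rename; output is still gated by -debug-only=<DEBUG_TYPE> and compiled out of release builds. Minimal sketch of the current spelling (the "bolt" debug type is illustrative):

#define DEBUG_TYPE "bolt"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

void traceInst(const llvm::MCInst &Inst) {
  LLVM_DEBUG(llvm::dbgs() << "Now updating for:\n  ");
  LLVM_DEBUG(Inst.dump()); // no-op unless assertions and -debug are enabled
}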
@ -688,7 +687,13 @@ public:
}
}
unsigned getCanonicalBranchOpcode(unsigned Opcode) const override {
unsigned getCondCode(const MCInst &Inst) const override {
// AArch64 does not use condition codes, so we just return the opcode
// of the conditional branch here.
return Inst.getOpcode();
}
unsigned getCanonicalBranchCondCode(unsigned Opcode) const override {
switch (Opcode) {
default:
return Opcode;
@ -711,7 +716,7 @@ public:
Inst.getOperand(0).getImm() != AArch64CC::NV &&
"Can't reverse ALWAYS cond code");
} else {
DEBUG(Inst.dump());
LLVM_DEBUG(Inst.dump());
llvm_unreachable("Unrecognized branch instruction");
}
return replaceBranchTarget(Inst, TBB, Ctx);


@ -7,5 +7,4 @@ add_llvm_library(LLVMBOLTTargetAArch64
)
include_directories(${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64 ${LLVM_BINARY_DIR}/lib/Target/AArch64)
include_directories(${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src)
include_directories(${BOLT_SOURCE_DIR}/src)


@ -7,5 +7,4 @@ add_llvm_library(LLVMBOLTTargetX86
)
include_directories(${LLVM_MAIN_SRC_DIR}/lib/Target/X86 ${LLVM_BINARY_DIR}/lib/Target/X86)
include_directories(${LLVM_MAIN_SRC_DIR}/tools/llvm-bolt/src)
include_directories(${BOLT_SOURCE_DIR}/src)


@ -30,8 +30,8 @@
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCAsmInfo.h"
@ -49,38 +49,8 @@ unsigned getShortBranchOpcode(unsigned Opcode) {
return Opcode;
case X86::JMP_2: return X86::JMP_1;
case X86::JMP_4: return X86::JMP_1;
case X86::JE_2: return X86::JE_1;
case X86::JE_4: return X86::JE_1;
case X86::JNE_2: return X86::JNE_1;
case X86::JNE_4: return X86::JNE_1;
case X86::JL_2: return X86::JL_1;
case X86::JL_4: return X86::JL_1;
case X86::JLE_2: return X86::JLE_1;
case X86::JLE_4: return X86::JLE_1;
case X86::JG_2: return X86::JG_1;
case X86::JG_4: return X86::JG_1;
case X86::JGE_2: return X86::JGE_1;
case X86::JGE_4: return X86::JGE_1;
case X86::JB_2: return X86::JB_1;
case X86::JB_4: return X86::JB_1;
case X86::JBE_2: return X86::JBE_1;
case X86::JBE_4: return X86::JBE_1;
case X86::JA_2: return X86::JA_1;
case X86::JA_4: return X86::JA_1;
case X86::JAE_2: return X86::JAE_1;
case X86::JAE_4: return X86::JAE_1;
case X86::JS_2: return X86::JS_1;
case X86::JS_4: return X86::JS_1;
case X86::JNS_2: return X86::JNS_1;
case X86::JNS_4: return X86::JNS_1;
case X86::JP_2: return X86::JP_1;
case X86::JP_4: return X86::JP_1;
case X86::JNP_2: return X86::JNP_1;
case X86::JNP_4: return X86::JNP_1;
case X86::JO_2: return X86::JO_1;
case X86::JO_4: return X86::JO_1;
case X86::JNO_2: return X86::JNO_1;
case X86::JNO_4: return X86::JNO_1;
case X86::JCC_2: return X86::JCC_1;
case X86::JCC_4: return X86::JCC_1;
}
}
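The long per-condition ladder collapses because upstream merged all conditional jumps into the JCC_1/JCC_2/JCC_4 opcodes, which carry the X86::CondCode as a trailing immediate operand. A sketch of emitting je <target> under the new scheme (assumes compilation inside the X86 target directory, as for this file):

#include "MCTargetDesc/X86BaseInfo.h"     // X86::COND_E
#include "MCTargetDesc/X86MCTargetDesc.h" // X86::JCC_1
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstBuilder.h"

llvm::MCInst makeJE(const llvm::MCSymbol *Target, llvm::MCContext &Ctx) {
  return llvm::MCInstBuilder(llvm::X86::JCC_1)
      .addExpr(llvm::MCSymbolRefExpr::create(Target, Ctx)) // branch target
      .addImm(llvm::X86::COND_E);                          // condition code
}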
@ -152,67 +122,25 @@ unsigned getShortArithOpcode(unsigned Opcode) {
}
}
unsigned getInvertedBranchOpcode(unsigned Opcode) {
switch (Opcode) {
default:
return Opcode;
case X86::JE_1: return X86::JNE_1;
case X86::JE_2: return X86::JNE_2;
case X86::JE_4: return X86::JNE_4;
case X86::JNE_1: return X86::JE_1;
case X86::JNE_2: return X86::JE_2;
case X86::JNE_4: return X86::JE_4;
case X86::JL_1: return X86::JGE_1;
case X86::JL_2: return X86::JGE_2;
case X86::JL_4: return X86::JGE_4;
case X86::JLE_1: return X86::JG_1;
case X86::JLE_2: return X86::JG_2;
case X86::JLE_4: return X86::JG_4;
case X86::JG_1: return X86::JLE_1;
case X86::JG_2: return X86::JLE_2;
case X86::JG_4: return X86::JLE_4;
case X86::JGE_1: return X86::JL_1;
case X86::JGE_2: return X86::JL_2;
case X86::JGE_4: return X86::JL_4;
case X86::JB_1: return X86::JAE_1;
case X86::JB_2: return X86::JAE_2;
case X86::JB_4: return X86::JAE_4;
case X86::JBE_1: return X86::JA_1;
case X86::JBE_2: return X86::JA_2;
case X86::JBE_4: return X86::JA_4;
case X86::JA_1: return X86::JBE_1;
case X86::JA_2: return X86::JBE_2;
case X86::JA_4: return X86::JBE_4;
case X86::JAE_1: return X86::JB_1;
case X86::JAE_2: return X86::JB_2;
case X86::JAE_4: return X86::JB_4;
case X86::JS_1: return X86::JNS_1;
case X86::JS_2: return X86::JNS_2;
case X86::JS_4: return X86::JNS_4;
case X86::JNS_1: return X86::JS_1;
case X86::JNS_2: return X86::JS_2;
case X86::JNS_4: return X86::JS_4;
case X86::JP_1: return X86::JNP_1;
case X86::JP_2: return X86::JNP_2;
case X86::JP_4: return X86::JNP_4;
case X86::JNP_1: return X86::JP_1;
case X86::JNP_2: return X86::JP_2;
case X86::JNP_4: return X86::JP_4;
case X86::JO_1: return X86::JNO_1;
case X86::JO_2: return X86::JNO_2;
case X86::JO_4: return X86::JNO_4;
case X86::JNO_1: return X86::JO_1;
case X86::JNO_2: return X86::JO_2;
case X86::JNO_4: return X86::JO_4;
case X86::LOOP:
case X86::LOOPE:
case X86::LOOPNE:
case X86::JECXZ:
case X86::JRCXZ:
// Loop/JCXZ instructions don't have a direct inverse counterpart, so
// inverting them would require more complex code transformations.
llvm_unreachable("Support for properly inverting LOOP/JCXZ "
"instructions is currently unimplemented.");
unsigned getInvertedCondCode(unsigned CC) {
switch (CC) {
default: return X86::COND_INVALID;
case X86::COND_E: return X86::COND_NE;
case X86::COND_NE: return X86::COND_E;
case X86::COND_L: return X86::COND_GE;
case X86::COND_LE: return X86::COND_G;
case X86::COND_G: return X86::COND_LE;
case X86::COND_GE: return X86::COND_L;
case X86::COND_B: return X86::COND_AE;
case X86::COND_BE: return X86::COND_A;
case X86::COND_A: return X86::COND_BE;
case X86::COND_AE: return X86::COND_B;
case X86::COND_S: return X86::COND_NS;
case X86::COND_NS: return X86::COND_S;
case X86::COND_P: return X86::COND_NP;
case X86::COND_NP: return X86::COND_P;
case X86::COND_O: return X86::COND_NO;
case X86::COND_NO: return X86::COND_O;
}
}
@ -471,6 +399,14 @@ public:
const MCRegisterInfo *RegInfo)
: MCPlusBuilder(Analysis, Info, RegInfo) {}
bool isBranch(const MCInst &Inst) const override {
return Analysis->isBranch(Inst) && !isTailCall(Inst);
}
bool isUnconditionalBranch(const MCInst &Inst) const override {
return Analysis->isUnconditionalBranch(Inst) && !isTailCall(Inst);
}
bool isNoop(const MCInst &Inst) const override {
switch (Inst.getOpcode()) {
case X86::NOOP:
@ -485,6 +421,18 @@ public:
return false;
}
unsigned getCondCode(const MCInst &Inst) const override {
switch (Inst.getOpcode()) {
default:
return X86::COND_INVALID;
case X86::JCC_1:
case X86::JCC_2:
case X86::JCC_4:
return Inst.getOperand(Info->get(Inst.getOpcode()).NumOperands - 1)
.getImm();
}
}
bool isBreakpoint(const MCInst &Inst) const override {
return Inst.getOpcode() == X86::INT3;
}
@ -509,8 +457,21 @@ public:
// FIXME: For compatibility with old LLVM only!
bool isTerminator(const MCInst &Inst) const override {
return Info->get(Inst.getOpcode()).isTerminator() ||
Inst.getOpcode() == X86::UD2B || Inst.getOpcode() == X86::TRAP;
if (Info->get(Inst.getOpcode()).isTerminator())
return true;
switch (Inst.getOpcode()) {
default:
return false;
case X86::TRAP:
// Opcodes previously known as X86::UD2B
case X86::UD1Wm:
case X86::UD1Lm:
case X86::UD1Qm:
case X86::UD1Wr:
case X86::UD1Lr:
case X86::UD1Qr:
return true;
}
}
bool isIndirectCall(const MCInst &Inst) const override {
@ -696,6 +657,8 @@ public:
Inst.getOperand(2).getReg());
}
unsigned getTrapFillValue() const override { return 0xCC; }
struct IndJmpMatcherFrag1 : MCInstMatcher {
std::unique_ptr<MCInstMatcher> Base;
std::unique_ptr<MCInstMatcher> Scale;
@ -996,24 +959,23 @@ public:
if (FirstInstGroup == 0)
return false;
const auto CondCode =
getShortBranchOpcode(getCanonicalBranchOpcode(SecondInst.getOpcode()));
const auto CondCode = getCanonicalBranchCondCode(getCondCode(SecondInst));
switch (CondCode) {
default:
llvm_unreachable("unexpected conditional code");
return false;
case X86::JE_1:
case X86::JL_1:
case X86::JG_1:
case X86::COND_E:
case X86::COND_L:
case X86::COND_G:
return true;
case X86::JO_1:
case X86::JP_1:
case X86::JS_1:
case X86::COND_O:
case X86::COND_P:
case X86::COND_S:
if (FirstInstGroup == 1)
return true;
return false;
case X86::JA_1:
case X86::JB_1:
case X86::COND_A:
case X86::COND_B:
if (FirstInstGroup != 3)
return true;
return false;
@ -1110,8 +1072,7 @@ public:
auto OI = getMemOperandDisp(Inst);
if (OI == Inst.end())
return false;
OI = Inst.erase(OI);
Inst.insert(OI, Operand);
*OI = Operand;
return true;
}
@ -1339,7 +1300,7 @@ public:
if (isUpper8BitReg(Operand.getReg()))
return true;
}
// Fall-through
LLVM_FALLTHROUGH;
default:
return false;
}
@ -1414,7 +1375,7 @@ public:
continue;
if (static_cast<int>(I) >= MemOpNo && I < X86::AddrNumOperands)
continue;
Sz = RegInfo->getRegClass(MCII.OpInfo[I].RegClass).getPhysRegSize();
Sz = RegInfo->getRegClass(MCII.OpInfo[I].RegClass).getSizeInBits() / 8;
break;
}
I = {Sz, IsLoad, IsStore, false, false};
@ -1438,8 +1399,8 @@ public:
const MCExpr *DispExpr;
if (!evaluateX86MemoryOperand(Inst, &BaseRegNum, &ScaleValue, &IndexRegNum,
&DispValue, &SegRegNum, &DispExpr)) {
DEBUG(dbgs() << "Evaluate failed on ");
DEBUG(Inst.dump());
LLVM_DEBUG(dbgs() << "Evaluate failed on ");
LLVM_DEBUG(Inst.dump());
return false;
}
@ -1702,7 +1663,7 @@ public:
auto MemOpNo = getMemoryOperandNo(Inst);
const auto MCII = Info->get(Inst.getOpcode());
const auto NumDefs = MCII.getNumDefs();
static BitVector SPBPAliases(BitVector(getAliases(X86::RSP)) |
static BitVector SPBPAliases(BitVector(getAliases(X86::RSP)) |=
getAliases(X86::RBP));
static BitVector SPAliases(getAliases(X86::RSP));
@ -1798,7 +1759,7 @@ public:
/// load from memory. It can be extended to work with memory store opcodes as
/// well as more memory load opcodes.
bool replaceMemOperandWithImm(MCInst &Inst, StringRef ConstantData,
uint32_t Offset) const override {
uint64_t Offset) const override {
enum CheckSignExt : uint8_t {
NOCHECK = 0,
CHECK8,
@ -2017,12 +1978,9 @@ public:
}
bool isTailCall(const MCInst &Inst) const override {
switch (Inst.getOpcode()) {
case X86::TAILJMPd:
case X86::TAILJMPm:
case X86::TAILJMPr:
return true;
}
auto IsTCOrErr = tryGetAnnotationAs<bool>(Inst, "TC");
if (IsTCOrErr)
return *IsTCOrErr;
if (getConditionalTailCall(Inst))
return true;
@ -2043,6 +2001,9 @@ public:
}
bool convertJmpToTailCall(MCInst &Inst) override {
if (isTailCall(Inst))
return false;
int NewOpcode;
switch (Inst.getOpcode()) {
default:
@ -2050,21 +2011,22 @@ public:
case X86::JMP_1:
case X86::JMP_2:
case X86::JMP_4:
NewOpcode = X86::TAILJMPd;
NewOpcode = X86::JMP_4;
break;
case X86::JMP16m:
case X86::JMP32m:
case X86::JMP64m:
NewOpcode = X86::TAILJMPm;
NewOpcode = X86::JMP32m;
break;
case X86::JMP16r:
case X86::JMP32r:
case X86::JMP64r:
NewOpcode = X86::TAILJMPr;
NewOpcode = X86::JMP32r;
break;
}
Inst.setOpcode(NewOpcode);
addAnnotation(Inst, "TC", true);
return true;
}
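With the TAILJMP{d,m,r} pseudo-opcodes gone from upstream, a tail call is now modeled as a regular jump opcode plus a boolean "TC" MCPlus annotation, which is what the rewritten isTailCall and convertJmpToTailCall above implement. Inside an MCPlusBuilder subclass the query reduces to a sketch like this (member-style fragment relying on the surrounding class, not standalone):

// tryGetAnnotationAs is provided by MCPlusBuilder and fails cleanly when
// the annotation is absent.
bool isMarkedTailCall(const llvm::MCInst &Inst) const {
  if (auto IsTC = tryGetAnnotationAs<bool>(Inst, "TC"))
    return *IsTC;
  return false;
}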
@ -2073,50 +2035,54 @@ public:
switch (Inst.getOpcode()) {
default:
return false;
case X86::TAILJMPd:
case X86::JMP_4:
NewOpcode = X86::JMP_1;
break;
case X86::TAILJMPm:
case X86::JMP32m:
NewOpcode = X86::JMP64m;
break;
case X86::TAILJMPr:
case X86::JMP32r:
NewOpcode = X86::JMP64r;
break;
}
Inst.setOpcode(NewOpcode);
removeAnnotation(Inst, "TC");
removeAnnotation(Inst, "Offset");
return true;
}
bool convertTailCallToCall(MCInst &Inst) const override {
bool convertTailCallToCall(MCInst &Inst) override {
int NewOpcode;
switch (Inst.getOpcode()) {
default:
return false;
case X86::TAILJMPd:
case X86::JMP_4:
NewOpcode = X86::CALL64pcrel32;
break;
case X86::TAILJMPm:
case X86::JMP32m:
NewOpcode = X86::CALL64m;
break;
case X86::TAILJMPr:
case X86::JMP32r:
NewOpcode = X86::CALL64r;
break;
}
Inst.setOpcode(NewOpcode);
removeAnnotation(Inst, "TC");
return true;
}
bool convertCallToIndirectCall(MCInst &Inst,
const MCSymbol *TargetLocation,
MCContext *Ctx) const override {
MCContext *Ctx) override {
bool IsTailCall = isTailCall(Inst);
assert((Inst.getOpcode() == X86::CALL64pcrel32 ||
Inst.getOpcode() == X86::TAILJMPd) &&
(Inst.getOpcode() == X86::JMP_4 && IsTailCall)) &&
"64-bit direct (tail) call instruction expected");
const auto NewOpcode = (Inst.getOpcode() == X86::CALL64pcrel32)
? X86::CALL64m
: X86::TAILJMPm;
: X86::JMP32m;
Inst.setOpcode(NewOpcode);
// Replace the first operand and preserve auxiliary operands of
@ -2139,20 +2105,23 @@ public:
return true;
}
void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) const override {
void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override {
bool IsTailCall = isTailCall(Inst);
if (IsTailCall)
removeAnnotation(Inst, "TC");
if (Inst.getOpcode() == X86::CALL64m ||
Inst.getOpcode() == X86::TAILJMPm) {
(Inst.getOpcode() == X86::JMP32m && IsTailCall)) {
Inst.setOpcode(X86::MOV64rm);
Inst.insert(Inst.begin(), MCOperand::createReg(Reg));
return;
}
if (Inst.getOpcode() == X86::CALL64r ||
Inst.getOpcode() == X86::TAILJMPr) {
(Inst.getOpcode() == X86::JMP32r && IsTailCall)) {
Inst.setOpcode(X86::MOV64rr);
Inst.insert(Inst.begin(), MCOperand::createReg(Reg));
return;
}
DEBUG(Inst.dump());
LLVM_DEBUG(Inst.dump());
llvm_unreachable("not implemented");
}
@ -2160,6 +2129,19 @@ public:
unsigned OldOpcode = Inst.getOpcode();
unsigned NewOpcode = OldOpcode;
// Check for and remove EIZ/RIZ. These represent ambiguous cases where a SIB
// byte is present, but no index is used and ModRM alone should have been
// enough. Converting to NoRegister effectively removes the SIB byte.
auto MemOpNo = getMemoryOperandNo(Inst);
if (MemOpNo >= 0) {
auto &IndexOp =
Inst.getOperand(static_cast<unsigned>(MemOpNo) + X86::AddrIndexReg);
if (IndexOp.getReg() == X86::EIZ ||
IndexOp.getReg() == X86::RIZ) {
IndexOp = MCOperand::createReg(X86::NoRegister);
}
}
if (isBranch(Inst)) {
NewOpcode = getShortBranchOpcode(OldOpcode);
} else if (OldOpcode == X86::MOV64ri) {
@ -2191,8 +2173,9 @@ public:
}
bool lowerTailCall(MCInst &Inst) override {
if (Inst.getOpcode() == X86::TAILJMPd) {
if (Inst.getOpcode() == X86::JMP_4 && isTailCall(Inst)) {
Inst.setOpcode(X86::JMP_1);
removeAnnotation(Inst, "TC");
return true;
}
return false;
@ -2214,6 +2197,12 @@ public:
return &SymExpr->getSymbol();
}
// This is the same as the base class, but since we are overriding one of
// getTargetSymbol's signatures above, we need to override all of them.
const MCSymbol *getTargetSymbol(const MCExpr *Expr) const override {
return &cast<const MCSymbolRefExpr>(Expr)->getSymbol();
}
bool analyzeBranch(InstructionIterator Begin,
InstructionIterator End,
const MCSymbol *&TBB,
@ -2238,9 +2227,10 @@ public:
break;
// Handle unconditional branches.
if (I->getOpcode() == X86::JMP_1 ||
I->getOpcode() == X86::JMP_2 ||
I->getOpcode() == X86::JMP_4) {
if ((I->getOpcode() == X86::JMP_1 ||
I->getOpcode() == X86::JMP_2 ||
I->getOpcode() == X86::JMP_4) &&
!isTailCall(*I)) {
// If any code was seen after this unconditional branch, we've seen
// unreachable code. Ignore them.
CondBranch = nullptr;
@ -2254,7 +2244,7 @@ public:
// Handle conditional branches and ignore indirect branches
if (!isUnsupportedBranch(I->getOpcode()) &&
getInvertedBranchOpcode(I->getOpcode()) == I->getOpcode()) {
getCondCode(*I) == X86::COND_INVALID) {
// Indirect branch
return false;
}
@ -2315,7 +2305,7 @@ public:
// = R_X86_64_PC32(Ln) + En - JT
// = R_X86_64_PC32(Ln + offsetof(En))
//
DEBUG(dbgs() << "Checking for PIC jump table\n");
LLVM_DEBUG(dbgs() << "Checking for PIC jump table\n");
MCInst *MemLocInstr = nullptr;
const MCInst *MovInstr = nullptr;
while (++II != IE) {
@ -2328,7 +2318,7 @@ public:
} else if (!MovInstr) {
// Expect to see MOV instruction.
if (!isMOVSX64rm32(Instr)) {
DEBUG(dbgs() << "MOV instruction expected.\n");
LLVM_DEBUG(dbgs() << "MOV instruction expected.\n");
break;
}
@ -2339,7 +2329,7 @@ public:
if (MovDestReg != R2)
std::swap(R1, R2);
if (MovDestReg != R2) {
DEBUG(dbgs() << "MOV instruction expected to set %r2\n");
LLVM_DEBUG(dbgs() << "MOV instruction expected to set %r2\n");
break;
}
@ -2364,11 +2354,11 @@ public:
if (!InstrDesc.hasDefOfPhysReg(Instr, R1, *RegInfo))
continue;
if (!isLEA64r(Instr)) {
DEBUG(dbgs() << "LEA instruction expected\n");
LLVM_DEBUG(dbgs() << "LEA instruction expected\n");
break;
}
if (Instr.getOperand(0).getReg() != R1) {
DEBUG(dbgs() << "LEA instruction expected to set %r1\n");
LLVM_DEBUG(dbgs() << "LEA instruction expected to set %r1\n");
break;
}
@ -2396,7 +2386,7 @@ public:
if (!MemLocInstr)
return std::make_pair(IndirectBranchType::UNKNOWN, nullptr);
DEBUG(dbgs() << "checking potential PIC jump table\n");
LLVM_DEBUG(dbgs() << "checking potential PIC jump table\n");
return std::make_pair(IndirectBranchType::POSSIBLE_PIC_JUMP_TABLE,
MemLocInstr);
}
@ -2868,11 +2858,10 @@ public:
Code.emplace_back(MCInstBuilder(X86::CMP64ri8)
.addReg(RegNo)
.addImm(Imm));
Code.emplace_back(MCInstBuilder(X86::JE_1)
Code.emplace_back(MCInstBuilder(X86::JCC_1)
.addExpr(MCSymbolRefExpr::create(
Target,
MCSymbolRefExpr::VK_None,
*Ctx)));
Target, MCSymbolRefExpr::VK_None, *Ctx))
.addImm(X86::COND_E));
return Code;
}
@ -2919,13 +2908,13 @@ public:
"unexpected binary expression");
const MCExpr *LHS = BinaryExpr->getLHS();
assert(LHS->getKind() == MCExpr::SymbolRef && "unexpected LHS");
Symbol = const_cast<MCSymbol *>(&LHS->getSymbol());
Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(LHS));
const MCExpr *RHS = BinaryExpr->getRHS();
assert(RHS->getKind() == MCExpr::Constant && "unexpected RHS");
Addend = cast<MCConstantExpr>(RHS)->getValue();
} else {
assert(ValueExpr->getKind() == MCExpr::SymbolRef && "unexpected value");
Symbol = const_cast<MCSymbol *>(&ValueExpr->getSymbol());
Symbol = const_cast<MCSymbol *>(this->getTargetSymbol(ValueExpr));
}
return Relocation({RelOffset, Symbol, RelType, Addend, 0});
@ -2977,8 +2966,8 @@ public:
}
bool createIndirectCall(MCInst &Inst, const MCSymbol *TargetLocation,
MCContext *Ctx, bool IsTailCall) const override {
Inst.setOpcode(IsTailCall ? X86::TAILJMPm : X86::CALL64m);
MCContext *Ctx, bool IsTailCall) override {
Inst.setOpcode(IsTailCall ? X86::JMP32m : X86::CALL64m);
Inst.addOperand(MCOperand::createReg(X86::RIP)); // BaseReg
Inst.addOperand(MCOperand::createImm(1)); // ScaleAmt
Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg
@ -2986,14 +2975,17 @@ public:
MCSymbolRefExpr::create(TargetLocation, MCSymbolRefExpr::VK_None,
*Ctx)));
Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg
if (IsTailCall)
addAnnotation(Inst, "TC", true);
return true;
}
bool createTailCall(MCInst &Inst, const MCSymbol *Target,
MCContext *Ctx) override {
Inst.setOpcode(X86::TAILJMPd);
Inst.setOpcode(X86::JMP_4);
Inst.addOperand(MCOperand::createExpr(
MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
addAnnotation(Inst, "TC", true);
return true;
}
@ -3005,73 +2997,41 @@ public:
bool reverseBranchCondition(MCInst &Inst, const MCSymbol *TBB,
MCContext *Ctx) const override {
Inst.setOpcode(getInvertedBranchOpcode(Inst.getOpcode()));
assert(Inst.getOpcode() != 0 && "invalid branch instruction");
unsigned InvCC = getInvertedCondCode(getCondCode(Inst));
assert(InvCC != X86::COND_INVALID && "invalid branch instruction");
Inst.getOperand(Info->get(Inst.getOpcode()).NumOperands - 1).setImm(InvCC);
Inst.getOperand(0) = MCOperand::createExpr(
MCSymbolRefExpr::create(TBB, MCSymbolRefExpr::VK_None, *Ctx));
return true;
}
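reverseBranchCondition now leaves the opcode alone and rewrites the trailing condition-code immediate in place. A sketch of that operand surgery (getInvertedCondCode is the helper defined earlier in this file):

#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"

void invertJCC(llvm::MCInst &Inst, const llvm::MCInstrInfo &Info) {
  // The condition code is always the last operand of JCC_1/JCC_2/JCC_4.
  unsigned CCIdx = Info.get(Inst.getOpcode()).getNumOperands() - 1;
  unsigned CC = Inst.getOperand(CCIdx).getImm();
  Inst.getOperand(CCIdx).setImm(getInvertedCondCode(CC));
}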
unsigned getCanonicalBranchOpcode(unsigned Opcode) const override {
switch (Opcode) {
default:
return Opcode;
unsigned getCanonicalBranchCondCode(unsigned CC) const override {
switch (CC) {
default: return X86::COND_INVALID;
case X86::JE_1: return X86::JE_1;
case X86::JE_2: return X86::JE_2;
case X86::JE_4: return X86::JE_4;
case X86::JNE_1: return X86::JE_1;
case X86::JNE_2: return X86::JE_2;
case X86::JNE_4: return X86::JE_4;
case X86::COND_E: return X86::COND_E;
case X86::COND_NE: return X86::COND_E;
case X86::JL_1: return X86::JL_1;
case X86::JL_2: return X86::JL_2;
case X86::JL_4: return X86::JL_4;
case X86::JGE_1: return X86::JL_1;
case X86::JGE_2: return X86::JL_2;
case X86::JGE_4: return X86::JL_4;
case X86::COND_L: return X86::COND_L;
case X86::COND_GE: return X86::COND_L;
case X86::JLE_1: return X86::JG_1;
case X86::JLE_2: return X86::JG_2;
case X86::JLE_4: return X86::JG_4;
case X86::JG_1: return X86::JG_1;
case X86::JG_2: return X86::JG_2;
case X86::JG_4: return X86::JG_4;
case X86::COND_LE: return X86::COND_G;
case X86::COND_G: return X86::COND_G;
case X86::JB_1: return X86::JB_1;
case X86::JB_2: return X86::JB_2;
case X86::JB_4: return X86::JB_4;
case X86::JAE_1: return X86::JB_1;
case X86::JAE_2: return X86::JB_2;
case X86::JAE_4: return X86::JB_4;
case X86::COND_B: return X86::COND_B;
case X86::COND_AE: return X86::COND_B;
case X86::JBE_1: return X86::JA_1;
case X86::JBE_2: return X86::JA_2;
case X86::JBE_4: return X86::JA_4;
case X86::JA_1: return X86::JA_1;
case X86::JA_2: return X86::JA_2;
case X86::JA_4: return X86::JA_4;
case X86::COND_BE: return X86::COND_A;
case X86::COND_A: return X86::COND_A;
case X86::JS_1: return X86::JS_1;
case X86::JS_2: return X86::JS_2;
case X86::JS_4: return X86::JS_4;
case X86::JNS_1: return X86::JS_1;
case X86::JNS_2: return X86::JS_2;
case X86::JNS_4: return X86::JS_4;
case X86::COND_S: return X86::COND_S;
case X86::COND_NS: return X86::COND_S;
case X86::JP_1: return X86::JP_1;
case X86::JP_2: return X86::JP_2;
case X86::JP_4: return X86::JP_4;
case X86::JNP_1: return X86::JP_1;
case X86::JNP_2: return X86::JP_2;
case X86::JNP_4: return X86::JP_4;
case X86::COND_P: return X86::COND_P;
case X86::COND_NP: return X86::COND_P;
case X86::JO_1: return X86::JO_1;
case X86::JO_2: return X86::JO_2;
case X86::JO_4: return X86::JO_4;
case X86::JNO_1: return X86::JO_1;
case X86::JNO_2: return X86::JO_2;
case X86::JNO_4: return X86::JO_4;
case X86::COND_O: return X86::COND_O;
case X86::COND_NO: return X86::COND_O;
}
}
@ -3136,7 +3096,7 @@ public:
bool isBranchOnMem(const MCInst &Inst) const override {
auto OpCode = Inst.getOpcode();
if (OpCode == X86::CALL64m || OpCode == X86::TAILJMPm ||
if (OpCode == X86::CALL64m || (OpCode == X86::JMP32m && isTailCall(Inst)) ||
OpCode == X86::JMP64m)
return true;
@ -3145,7 +3105,7 @@ public:
bool isBranchOnReg(const MCInst &Inst) const override {
auto OpCode = Inst.getOpcode();
if (OpCode == X86::CALL64r || OpCode == X86::TAILJMPr ||
if (OpCode == X86::CALL64r || (OpCode == X86::JMP32r && isTailCall(Inst)) ||
OpCode == X86::JMP64r)
return true;
@ -3237,7 +3197,7 @@ public:
std::vector<MCInst>
createInstrumentedIndirectCall(const MCInst &CallInst, bool TailCall,
MCSymbol *HandlerFuncAddr, int CallSiteID,
MCContext *Ctx) const override {
MCContext *Ctx) override {
// Check if the target address expression used in the original indirect call
// uses the stack pointer, which we are going to clobber.
static BitVector SPAliases(getAliases(X86::RSP));
@ -3270,6 +3230,9 @@ public:
createStackPointerDecrement(Insts.back(), 8, /*NoFlagsClobber=*/false);
}
Insts.emplace_back(CallInst);
// Insts.back() and CallInst now share the same annotation instruction.
// Strip it from Insts.back(), only preserving tail call annotation.
stripAnnotations(Insts.back(), /*KeepTC=*/true);
convertIndirectCallToLoad(Insts.back(), TempReg);
if (UsesSP) {
Insts.emplace_back();
@ -3504,7 +3467,8 @@ public:
}
// jump to next target compare.
NextTarget = Ctx->createTempSymbol(); // generate label for the next block
NextTarget =
Ctx->createNamedTempSymbol(); // generate label for the next block
NewCall->push_back(CallInst);
if (IsJumpTable) {
@ -3512,7 +3476,7 @@ public:
// Jump to next compare if target addresses don't match.
Je.clear();
Je.setOpcode(X86::JE_1);
Je.setOpcode(X86::JCC_1);
if (Targets[i].first) {
Je.addOperand(MCOperand::createExpr(
MCSymbolRefExpr::create(Targets[i].first,
@ -3521,19 +3485,21 @@ public:
} else {
Je.addOperand(MCOperand::createImm(Targets[i].second));
}
Je.addOperand(MCOperand::createImm(X86::COND_E));
assert(!isInvoke(CallInst));
} else {
MCInst &Jne = NewCall->back();
// Jump to next compare if target addresses don't match.
Jne.clear();
Jne.setOpcode(X86::JNE_1);
Jne.setOpcode(X86::JCC_1);
Jne.addOperand(MCOperand::createExpr(MCSymbolRefExpr::create(
NextTarget, MCSymbolRefExpr::VK_None, *Ctx)));
Jne.addOperand(MCOperand::createImm(X86::COND_NE));
// Call specific target directly.
Results.push_back(
std::make_pair(Ctx->createTempSymbol(), std::vector<MCInst>()));
Results.push_back(std::make_pair(Ctx->createNamedTempSymbol(),
std::vector<MCInst>()));
NewCall = &Results.back().second;
NewCall->push_back(CallInst);
MCInst &CallOrJmp = NewCall->back();
@ -3541,10 +3507,10 @@ public:
CallOrJmp.clear();
if (MinimizeCodeSize && !LoadElim) {
CallOrJmp.setOpcode(IsTailCall ? X86::TAILJMPr : X86::CALL64r);
CallOrJmp.setOpcode(IsTailCall ? X86::JMP32r : X86::CALL64r);
CallOrJmp.addOperand(MCOperand::createReg(FuncAddrReg));
} else {
CallOrJmp.setOpcode(IsTailCall ? X86::TAILJMPd : X86::CALL64pcrel32);
CallOrJmp.setOpcode(IsTailCall ? X86::JMP_4 : X86::CALL64pcrel32);
if (Targets[i].first) {
CallOrJmp.addOperand(MCOperand::createExpr(MCSymbolRefExpr::create(
@ -3553,6 +3519,8 @@ public:
CallOrJmp.addOperand(MCOperand::createImm(Targets[i].second));
}
}
if (IsTailCall)
addAnnotation(CallOrJmp, "TC", true);
if (isInvoke(CallInst) && !isInvoke(CallOrJmp)) {
// Copy over any EH or GNU args size information from the original
@ -3570,7 +3538,7 @@ public:
// the merge block.
if (i == 0) {
// Fallthrough to merge block.
MergeBlock = Ctx->createTempSymbol();
MergeBlock = Ctx->createNamedTempSymbol();
} else {
// Insert jump to the merge block if we are not doing a fallthrough.
jumpToMergeBlock(*NewCall);
@ -3635,16 +3603,18 @@ public:
shortenInstruction(CompareInst);
// jump to next target compare.
NextTarget = Ctx->createTempSymbol(); // generate label for the next block
NextTarget =
Ctx->createNamedTempSymbol(); // generate label for the next block
CurBB->push_back(MCInst());
MCInst &JEInst = CurBB->back();
JEInst.setLoc(IJmpInst.getLoc());
// Jump to target if indices match
JEInst.setOpcode(X86::JE_1);
JEInst.setOpcode(X86::JCC_1);
JEInst.addOperand(MCOperand::createExpr(MCSymbolRefExpr::create(
Targets[i].first, MCSymbolRefExpr::VK_None, *Ctx)));
JEInst.addOperand(MCOperand::createImm(X86::COND_E));
}
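The createTempSymbol() to createNamedTempSymbol() substitutions are needed because current MCContext may create temp labels without names during object emission, while BOLT relies on these block labels having a printable, unique name. Sketch (the "BOLTtmp" stem is illustrative):

#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"

// The argument is a name stem; MCContext appends a unique numeric suffix.
llvm::MCSymbol *makeBlockLabel(llvm::MCContext &Ctx) {
  return Ctx.createNamedTempSymbol("BOLTtmp");
}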
// Cold call block.


@ -12,6 +12,9 @@
//===----------------------------------------------------------------------===//
#include "Utils.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
@ -46,5 +49,52 @@ void check_error(Error E, Twine Message) {
});
}
Optional<uint8_t> readDWARFExpressionTargetReg(StringRef ExprBytes) {
uint8_t Opcode = ExprBytes[0];
if (Opcode == dwarf::DW_CFA_def_cfa_expression)
return NoneType();
assert((Opcode == dwarf::DW_CFA_expression ||
Opcode == dwarf::DW_CFA_val_expression) &&
"invalid DWARF expression CFI");
assert(ExprBytes.size() > 1 && "DWARF expression CFI is too short");
const uint8_t *const Start =
reinterpret_cast<const uint8_t *>(ExprBytes.drop_front(1).data());
const uint8_t *const End =
reinterpret_cast<const uint8_t *>(Start + ExprBytes.size() - 1);
uint8_t Reg = decodeULEB128(Start, nullptr, End);
return Reg;
}
} // namespace bolt
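readDWARFExpressionTargetReg above leans on decodeULEB128 to pull the register number that follows DW_CFA_expression / DW_CFA_val_expression. A self-contained check of the decoder on a two-byte encoding:

#include "llvm/Support/LEB128.h"
#include <cassert>
#include <cstdint>

int main() {
  // 0x83 contributes its low 7 bits (3) and sets the continuation bit;
  // 0x01 contributes 1 << 7, so the decoded value is 131.
  const uint8_t Bytes[] = {0x83, 0x01};
  unsigned Consumed = 0;
  uint64_t Val = llvm::decodeULEB128(Bytes, &Consumed, Bytes + 2);
  assert(Val == 131 && Consumed == 2);
  (void)Val;
  return 0;
}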
bool operator==(const llvm::MCCFIInstruction &L,
const llvm::MCCFIInstruction &R) {
if (L.getOperation() != R.getOperation())
return false;
switch (L.getOperation()) {
case MCCFIInstruction::OpRestore:
case MCCFIInstruction::OpSameValue:
case MCCFIInstruction::OpUndefined:
case MCCFIInstruction::OpDefCfaRegister:
return L.getRegister() == R.getRegister();
case MCCFIInstruction::OpRegister:
return L.getRegister() == R.getRegister() &&
L.getRegister2() == R.getRegister2();
case MCCFIInstruction::OpOffset:
case MCCFIInstruction::OpRelOffset:
case MCCFIInstruction::OpDefCfa:
return L.getRegister() == R.getRegister() && L.getOffset() == R.getOffset();
case MCCFIInstruction::OpEscape:
return L.getValues() == R.getValues();
case MCCFIInstruction::OpRememberState:
case MCCFIInstruction::OpRestoreState:
return true;
case MCCFIInstruction::OpDefCfaOffset:
case MCCFIInstruction::OpAdjustCfaOffset:
return L.getOffset() == R.getOffset();
default:
return false;
}
}
} // namespace llvm

Some files were not shown because too many files have changed in this diff.