# Compiler projects using llvm
#!/usr/bin/env python
#
# This tool is a debug location coverage calculator.
# It parses the llvm-dwarfdump --statistics output and reports it
# in a more human-readable way.
#

from __future__ import print_function
import argparse
import os
import sys
from json import loads
from math import ceil
from collections import OrderedDict
from subprocess import Popen, PIPE

# This special value marks statistics that overflowed.
TAINT_VALUE = "tainted"

# Initialize the plot.
def init_plot(plt):
  """Set the title, axis labels and tick styling of the current plot.

  plt -- the pyplot-like object the calls are issued on.
  """
  heading = 'Debug Location Statistics'
  x_caption = 'location buckets'
  y_caption = 'number of variables in the location buckets'

  plt.title(heading, fontweight='bold')
  plt.xlabel(x_caption)
  plt.ylabel(y_caption)
  # The bucket labels are long, so they are drawn slanted and small.
  x_tick_style = dict(rotation=45, fontsize='x-small')
  plt.xticks(**x_tick_style)
  plt.yticks()

# Finalize the plot.
def finish_plot(plt):
  """Add the legend and grid to the plot, then save it as 'locstats.png'.

  plt -- the pyplot-like object the calls are issued on.
  """
  output_image = 'locstats.png'
  grid_style = dict(color='grey', which='major', axis='y', linestyle='-',
                    linewidth=0.3)

  plt.legend()
  plt.grid(**grid_style)
  plt.savefig(output_image)
  print('The plot was saved within "locstats.png".')

# Holds the debug location statistics.
class LocationStats:
  """Debug location statistics gathered for a single file.

  Any numeric field may hold TAINT_VALUE instead of a number. A percentage
  derived from a tainted field is reported as TAINT_VALUE as well.
  """

  def __init__(self, file_name, variables_total, variables_total_locstats,
    variables_with_loc, variables_scope_bytes_covered, variables_scope_bytes,
    variables_coverage_map):
    """Store the statistics.

    file_name -- name used to label the plots.
    variables_total, variables_with_loc -- only used for the "total
      availability" line of pretty_print(); the line is skipped when either
      one is falsy (e.g. None or 0).
    variables_total_locstats -- number of processed variables; denominator of
      the per-bucket percentages.
    variables_scope_bytes_covered, variables_scope_bytes -- numerator and
      denominator of the PC ranges coverage.
    variables_coverage_map -- mapping from a bucket label to the number of
      variables that fall into that bucket.
    """
    self.file_name = file_name
    self.variables_total = variables_total
    self.variables_total_locstats = variables_total_locstats
    self.variables_with_loc = variables_with_loc
    self.scope_bytes_covered = variables_scope_bytes_covered
    self.scope_bytes = variables_scope_bytes
    self.variables_coverage_map = variables_coverage_map

  @staticmethod
  def _percentage(part, whole):
    """Return `part` as an integer percentage of `whole`.

    Returns TAINT_VALUE when either operand is tainted, and 0 when `whole`
    is zero (instead of raising ZeroDivisionError).
    """
    if part == TAINT_VALUE or whole == TAINT_VALUE:
      return TAINT_VALUE
    if whole == 0:
      return 0
    # NOTE: ceil() is applied to the numerator only, so for integer counts
    # the int() conversion truncates the quotient. This keeps the numbers
    # this tool has always reported.
    return int(ceil(part * 100.0) / whole)

  def get_pc_coverage(self):
    """Return the PC ranges coverage in percent, or TAINT_VALUE.

    Returns 0 when no scope bytes were recorded, so the plotting paths do
    not fail on a division by zero.
    """
    return self._percentage(self.scope_bytes_covered, self.scope_bytes)

  def pretty_print(self):
    """Print the location buckets as a table on the standard output.

    Returns -1 (after printing a note) when there are no scope bytes, and 0
    otherwise.
    """
    if self.scope_bytes == 0:
      print('No scope bytes found.')
      return -1

    pc_ranges_covered = self.get_pc_coverage()

    # Compute every row before printing anything, so that a missing bucket
    # fails before partial output is produced.
    rows = []
    for cov_bucket in coverage_buckets():
      samples = self.variables_coverage_map[cov_bucket]
      rows.append((cov_bucket, samples,
                   self._percentage(samples, self.variables_total_locstats)))

    thick_rule = ' ================================================='
    thin_rule = ' -------------------------------------------------'

    print(thick_rule)
    print('            Debug Location Statistics       ')
    print(thick_rule)
    print('     cov%           samples         percentage(~)  ')
    print(thin_rule)
    for cov_bucket, samples, percentage in rows:
      # One format serves both cases: without an explicit type, numbers are
      # right-aligned (same as the 'd' type) and the TAINT_VALUE string is
      # left-aligned.
      print('   {0:10}     {1:8}              {2:3}%'.
            format(cov_bucket, samples, percentage))
    print(thick_rule)
    print(' -the number of debug variables processed: '
          + str(self.variables_total_locstats))
    print(' -PC ranges covered: ' + str(pc_ranges_covered) + '%')

    # Only if we are processing all the variables output the total
    # availability.
    if self.variables_total and self.variables_with_loc:
      total_availability = self._percentage(self.variables_with_loc,
                                            self.variables_total)
      print(thin_rule)
      print(' -total availability: ' + str(total_availability) + '%')
    print(thick_rule)

    return 0

  def draw_plot(self):
    """Draw a bar chart of the location buckets and save it.

    matplotlib is imported here so it is only required when plotting.
    """
    from matplotlib import pyplot as plt

    # Materialize the dict views so matplotlib receives plain sequences.
    bucket_labels = list(self.variables_coverage_map.keys())
    bucket_counts = list(self.variables_coverage_map.values())

    buckets = range(len(bucket_counts))
    plt.figure(figsize=(12, 8))
    init_plot(plt)
    plt.bar(buckets, bucket_counts, align='center',
            tick_label=bucket_labels,
            label='variables of {}'.format(self.file_name))

    # Place the text box with the coverage info.
    pc_ranges_covered = self.get_pc_coverage()
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    plt.text(0.02, 0.90, 'PC ranges covered: {}%'.format(pc_ranges_covered),
             transform=plt.gca().transAxes, fontsize=12,
             verticalalignment='top', bbox=props)

    finish_plot(plt)

  def draw_location_diff(self, locstats_to_compare):
    """Plot this object's buckets next to those of `locstats_to_compare`.

    locstats_to_compare -- another LocationStats object.
    matplotlib is imported here so it is only required when plotting.
    """
    from matplotlib import pyplot as plt

    pc_ranges_covered = self.get_pc_coverage()
    pc_ranges_covered_to_compare = locstats_to_compare.get_pc_coverage()

    # Materialize the dict views so matplotlib receives plain sequences.
    bucket_counts = list(self.variables_coverage_map.values())
    bucket_counts_to_compare = \
      list(locstats_to_compare.variables_coverage_map.values())

    buckets = range(len(bucket_counts))
    buckets_to_compare = range(len(bucket_counts_to_compare))

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    init_plot(plt)

    comparison_keys = list(coverage_buckets())
    # With align='edge', a positive width grows to the right of the tick and
    # a negative width to the left, so the two series sit side by side.
    ax.bar(buckets, bucket_counts, align='edge',
           width=0.4,
           label='variables of {}'.format(self.file_name))
    ax.bar(buckets_to_compare,
           bucket_counts_to_compare,
           color='r', align='edge', width=-0.4,
           label='variables of {}'.format(locstats_to_compare.file_name))
    ax.set_xticks(range(len(comparison_keys)))
    ax.set_xticklabels(comparison_keys)

    # One text box per file with its PC ranges coverage.
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    plt.text(0.02, 0.88,
             '{} PC ranges covered: {}%'.
             format(self.file_name, pc_ranges_covered),
             transform=plt.gca().transAxes, fontsize=12,
             verticalalignment='top', bbox=props)
    plt.text(0.02, 0.83,
             '{} PC ranges covered: {}%'.
             format(locstats_to_compare.file_name,
                    pc_ranges_covered_to_compare),
             transform=plt.gca().transAxes, fontsize=12,
             verticalalignment='top', bbox=props)

    finish_plot(plt)

# Define the location buckets.
def coverage_buckets():
  """Yield the labels of the location coverage buckets, lowest first.

  The sequence is '0%', '(0%,10%)', the nine half-open ten-percent ranges
  from '[10%,20%)' to '[90%,100%)', and finally '100%'.
  """
  yield '0%'
  yield '(0%,10%)'
  for decile in range(1, 10):
    lower_bound = decile * 10
    yield '[%d%%,%d%%)' % (lower_bound, lower_bound + 10)
  yield '100%'

# Parse the JSON representing the debug statistics, and create a
# LocationStats object.
def parse_locstats(opts, binary):
  """Run llvm-dwarfdump --statistics on `binary` and build a LocationStats.

  opts -- parsed program options; `only_variables`, `only_formal_parameters`
    and `ignore_debug_entry_values` select which statistics are read.
  binary -- path of the file to analyze.

  The llvm-dwarfdump executable is looked up next to this script.

  Prints an error and exits with status 1 when llvm-dwarfdump cannot be
  started, when its output is not a JSON object, or when an expected field
  is missing. A field reported as 'overflowed' produces a warning and is
  stored as TAINT_VALUE.
  """
  # These stay None unless the statistics for all variables are requested.
  variables_total = None
  variables_with_loc = None
  variables_coverage_map = OrderedDict()

  # Get the directory of the LLVM tools.
  llvm_dwarfdump_cmd = os.path.join(os.path.dirname(__file__),
                                    "llvm-dwarfdump")
  # The statistics llvm-dwarfdump option.
  llvm_dwarfdump_stats_opt = "--statistics"

  # Generate the stats with the llvm-dwarfdump.
  try:
    subproc = Popen([llvm_dwarfdump_cmd, llvm_dwarfdump_stats_opt, binary],
                    stdin=PIPE, stdout=PIPE, stderr=PIPE,
                    universal_newlines=True)
  except OSError as err:
    # E.g. the executable does not exist or is not runnable.
    print('error: Unable to run "' + llvm_dwarfdump_cmd + '": ' + str(err))
    sys.exit(1)
  cmd_stdout, cmd_stderr = subproc.communicate()

  # Get the JSON and parse it. json's decode errors are ValueErrors, so
  # SystemExit and KeyboardInterrupt are no longer swallowed here.
  try:
    json_parsed = loads(cmd_stdout)
  except ValueError:
    json_parsed = None

  # The fields are looked up by name below, so a JSON object is required.
  if not isinstance(json_parsed, dict):
    print('error: No valid llvm-dwarfdump statistics found.')
    if cmd_stderr and cmd_stderr.strip():
      # Show what llvm-dwarfdump itself complained about.
      print(cmd_stderr.strip())
    sys.exit(1)

  # TODO: Parse the statistics Version from JSON.

  def init_field(name):
    # Read one statistic, mapping the 'overflowed' marker to TAINT_VALUE.
    try:
      value = json_parsed[name]
    except KeyError:
      print('error: "' + name + '" field is missing from the '
            'llvm-dwarfdump statistics.')
      sys.exit(1)
    if value == 'overflowed':
      print('warning: "' + name + '" field overflowed.')
      return TAINT_VALUE
    return value

  # The three kinds of statistics use the same field names except for the
  # prefix of the counters and the name of the byte sums.
  if opts.only_variables:
    # Read the JSON only for local variables.
    count_prefix = '#local vars'
    sum_prefix = 'sum_all_local_vars'
  elif opts.only_formal_parameters:
    # Read the JSON only for formal parameters.
    count_prefix = '#params'
    sum_prefix = 'sum_all_params'
  else:
    # Read the JSON for both local variables and formal parameters.
    count_prefix = '#variables'
    sum_prefix = 'sum_all_variables'
    variables_total = init_field('#source variables')
    variables_with_loc = init_field('#source variables with location')

  variables_total_locstats = \
    init_field(count_prefix + ' processed by location statistics')
  variables_scope_bytes_covered = \
    init_field(sum_prefix + '(#bytes in parent scope covered '
               'by DW_AT_location)')
  variables_scope_bytes = \
    init_field(sum_prefix + '(#bytes in parent scope)')

  if opts.ignore_debug_entry_values:
    # Leave out the bytes that are covered only through entry values, and
    # read the buckets that were computed without entry values.
    variables_scope_bytes_entry_values = \
      init_field(sum_prefix + '(#bytes in parent scope covered '
                 'by DW_OP_entry_value)')
    if variables_scope_bytes_covered != TAINT_VALUE and \
       variables_scope_bytes_entry_values != TAINT_VALUE:
      variables_scope_bytes_covered = variables_scope_bytes_covered \
        - variables_scope_bytes_entry_values
    bucket_prefix = count_prefix + ' - entry values'
  else:
    bucket_prefix = count_prefix

  for cov_bucket in coverage_buckets():
    cov_category = '{0} with {1} of parent scope covered ' \
                   'by DW_AT_location'.format(bucket_prefix, cov_bucket)
    variables_coverage_map[cov_bucket] = init_field(cov_category)

  return LocationStats(binary, variables_total, variables_total_locstats,
                       variables_with_loc, variables_scope_bytes_covered,
                       variables_scope_bytes, variables_coverage_map)

# Parse the program arguments.
def parse_program_args(parser):
  """Register the tool's options on `parser` and parse the command line.

  parser -- the argument parser to configure.
  Returns the namespace produced by parser.parse_args().
  """
  # Every switch is a boolean flag that is off by default; they are
  # registered in the order they are listed here.
  boolean_switches = (
    ('--only-variables',
     'calculate the location statistics only for local variables'),
    ('--only-formal-parameters',
     'calculate the location statistics only for formal parameters'),
    ('--ignore-debug-entry-values',
     'ignore the location statistics on locations with '
     'entry values'),
    ('--draw-plot',
     'show histogram of location buckets generated (requires '
     'matplotlib)'),
    ('--compare',
     'compare the debug location coverage on two files provided, '
     'and draw a plot showing the difference  (requires '
     'matplotlib)'),
  )
  for switch, description in boolean_switches:
    parser.add_argument(switch, action='store_true', default=False,
                        help=description)

  # One or more positional file names.
  parser.add_argument('file_names', nargs='+', type=str, help='file to process')

  parsed_options = parser.parse_args()
  return parsed_options

# Verify that the program inputs meet the requirements.
def verify_program_inputs(opts):
  """Check that the parsed options form a valid invocation.

  opts -- the parsed program options.
  Prints an error for the first violated requirement and returns False;
  returns True when every requirement is met.
  """
  file_count = len(opts.file_names)

  # (violated?, message) pairs, reported in this order.
  requirements = (
    (len(sys.argv) < 2,
     'error: Too few arguments.'),
    (opts.only_variables and opts.only_formal_parameters,
     'error: Please use just one --only* option.'),
    (not opts.compare and file_count != 1,
     'error: Please specify only one file to process.'),
    (opts.compare and file_count != 2,
     'error: Please specify two files to process.'),
  )
  for violated, message in requirements:
    if violated:
      print(message)
      return False

  # Plotting is the only feature that needs matplotlib.
  needs_matplotlib = opts.draw_plot or opts.compare
  if not needs_matplotlib:
    return True

  try:
    import matplotlib
  except ImportError:
    print('error: matplotlib not found.')
    return False
  return True

def Main():
  """Entry point: parse the options, collect the statistics and report them.

  Exits with status 1 (after printing the help) when the inputs are invalid,
  and with status 0 when pretty printing reports that there is nothing to
  show.
  """
  arg_parser = argparse.ArgumentParser()
  options = parse_program_args(arg_parser)

  if not verify_program_inputs(options):
    arg_parser.print_help()
    sys.exit(1)

  # The first file is always processed.
  primary_stats = parse_locstats(options, options.file_names[0])

  if options.compare:
    # Draw a plot showing the difference in debug location coverage between
    # two files.
    secondary_stats = parse_locstats(options, options.file_names[1])
    primary_stats.draw_location_diff(secondary_stats)
    return

  if options.draw_plot:
    # Draw a histogram representing the location buckets.
    primary_stats.draw_plot()
    return

  # Pretty print collected info on the standard output.
  if primary_stats.pretty_print() == -1:
    sys.exit(0)

# Run the tool only when this file is executed as a script.
if __name__ == '__main__':
  Main()
  # Main() returned without exiting, so finish with a success status.
  sys.exit(0)