The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/*
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */

#include <apr_signal.h>

#include "svn_cmdline.h"
#include "svn_dirent_uri.h"
#include "svn_pools.h"
#include "svn_repos.h"
#include "svn_opt.h"
#include "svn_utf.h"
#include "svn_version.h"

#include "../../subversion/libsvn_fs_fs/fs.h"
#include "../../subversion/libsvn_fs_fs/fs_fs.h"
/* for svn_fs_fs__id_* (used in assertions only) */
#include "../../subversion/libsvn_fs_fs/id.h"

#include "svn_private_config.h"


/** Help messages and version checking. **/

static svn_error_t *
version(apr_pool_t *pool)
{
  return svn_opt_print_help3(NULL, "svn-rep-sharing-stats", TRUE, FALSE, NULL,
                             NULL, NULL, NULL, NULL, NULL, pool);
}

static void
usage(apr_pool_t *pool)
{
  svn_error_clear(svn_cmdline_fprintf
                  (stderr, pool,
                   _("Type 'svn-rep-sharing-stats --help' for usage.\n")));
}


static void
help(const apr_getopt_option_t *options, apr_pool_t *pool)
{
  svn_error_clear
    (svn_cmdline_fprintf
     (stdout, pool,
      _("usage: svn-rep-sharing-stats [OPTIONS] REPOS_PATH\n\n"
        "  Prints the reference count statistics for representations\n"
        "  in an FSFS repository.\n"
        "\n"
        "  At least one of the options --data/--prop/--both must be specified.\n"
        "\n"
        "Valid options:\n")));
  while (options->description)
    {
      const char *optstr;
      svn_opt_format_option(&optstr, options, TRUE, pool);
      svn_error_clear(svn_cmdline_fprintf(stdout, pool, "  %s\n", optstr));
      ++options;
    }
  svn_error_clear(svn_cmdline_fprintf(stdout, pool, "\n"));
  exit(0);
}


/* Version compatibility check */
static svn_error_t *
check_lib_versions(void)
{
  static const svn_version_checklist_t checklist[] =
    {
      /* ### check FSFS version */
      { "svn_subr",   svn_subr_version },
      { "svn_fs",     svn_fs_version },
      { NULL, NULL }
    };

  SVN_VERSION_DEFINE(my_version);
  return svn_error_trace(svn_ver_check_list(&my_version, checklist));
}



/** Cancellation stuff, ### copied from subversion/svn/main.c */

/* A flag to see if we've been cancelled by the client or not. */
static volatile sig_atomic_t cancelled = FALSE;

/* A signal handler to support cancellation. */
static void
signal_handler(int signum)
{
  apr_signal(signum, SIG_IGN);
  cancelled = TRUE;
}

/* Our cancellation callback. */
static svn_error_t *
svn_cl__check_cancel(void *baton)
{
  if (cancelled)
    return svn_error_create(SVN_ERR_CANCELLED, NULL, _("Caught signal"));
  else
    return SVN_NO_ERROR;
}

static svn_cancel_func_t cancel_func = svn_cl__check_cancel;

static void set_up_cancellation(void)
{
  /* Set up our cancellation support. */
  apr_signal(SIGINT, signal_handler);
#ifdef SIGBREAK
  /* SIGBREAK is a Win32 specific signal generated by ctrl-break. */
  apr_signal(SIGBREAK, signal_handler);
#endif
#ifdef SIGHUP
  apr_signal(SIGHUP, signal_handler);
#endif
#ifdef SIGTERM
  apr_signal(SIGTERM, signal_handler);
#endif

#ifdef SIGPIPE
  /* Disable SIGPIPE generation for the platforms that have it. */
  apr_signal(SIGPIPE, SIG_IGN);
#endif

#ifdef SIGXFSZ
  /* Disable SIGXFSZ generation for the platforms that have it, otherwise
   * working with large files when compiled against an APR that doesn't have
   * large file support will crash the program, which is uncool. */
  apr_signal(SIGXFSZ, SIG_IGN);
#endif
}


/** Program-specific code. **/
enum {
  OPT_VERSION = SVN_OPT_FIRST_LONGOPT_ID,
  OPT_DATA,
  OPT_PROP,
  OPT_BOTH
};

static svn_error_t *check_experimental(void)
{
  if (getenv("SVN_REP_SHARING_STATS_IS_EXPERIMENTAL"))
      return SVN_NO_ERROR;

  return svn_error_create(APR_EGENERAL, NULL,
                          "This code is experimental and should not "
                          "be used on live data.");
}

/* The parts of a rep that determine whether it's being shared. */
struct key_t
{
  svn_revnum_t revision;
  apr_off_t offset;
};

/* What we need to know about a rep. */
struct value_t
{
  svn_checksum_t *sha1_checksum;
  apr_uint64_t refcount;
};

/* Increment records[rep] if both are non-NULL and REP contains a sha1.
 * Allocate keys and values in RESULT_POOL.
 */
static svn_error_t *record(apr_hash_t *records,
                           representation_t *rep,
                           apr_pool_t *result_pool)
{
  struct key_t *key;
  struct value_t *value;

  /* Skip if we ignore this particular kind of reps, or if the rep doesn't
   * exist or doesn't have the checksum we are after.  (The latter case
   * often corresponds to node_rev->kind == svn_node_dir.)
   */
  if (records == NULL || rep == NULL || rep->sha1_checksum == NULL)
    return SVN_NO_ERROR;

  /* Construct the key.
   *
   * Must use calloc() because apr_hash_* pay attention to padding bytes too.
   */
  key = apr_pcalloc(result_pool, sizeof(*key));
  key->revision = rep->revision;
  key->offset = rep->offset;

  /* Update or create the value. */
  if ((value = apr_hash_get(records, key, sizeof(*key))))
    {
      /* Paranoia. */
      SVN_ERR_ASSERT(value->sha1_checksum != NULL);
      SVN_ERR_ASSERT(svn_checksum_match(value->sha1_checksum,
                                        rep->sha1_checksum));
      /* Real work. */
      value->refcount++;
    }
  else
    {
      value = apr_palloc(result_pool, sizeof(*value));
      value->sha1_checksum = svn_checksum_dup(rep->sha1_checksum, result_pool);
      value->refcount = 1;
    }

  /* Store them. */
  apr_hash_set(records, key, sizeof(*key), value);

  return SVN_NO_ERROR;
}

/* Inspect the data and/or prop reps of revision REVNUM in FS.  Store
 * reference count tallies in passed hashes (allocated in RESULT_POOL).
 *
 * If PROP_REPS or DATA_REPS is NULL, the respective kind of reps are not
 * tallied.
 *
 * Print progress report to STDERR unless QUIET is true.
 *
 * Use SCRATCH_POOL for temporary allocations.
 */
static svn_error_t *
process_one_revision(svn_fs_t *fs,
                     svn_revnum_t revnum,
                     svn_boolean_t quiet,
                     apr_hash_t *prop_reps,
                     apr_hash_t *data_reps,
                     apr_hash_t *both_reps,
                     apr_pool_t *result_pool,
                     apr_pool_t *scratch_pool)
{
  svn_fs_root_t *rev_root;
  apr_hash_t *paths_changed;
  apr_hash_index_t *hi;

  if (! quiet)
    SVN_ERR(svn_cmdline_fprintf(stderr, scratch_pool,
                                "processing r%ld\n", revnum));

  /* Get the changed paths. */
  SVN_ERR(svn_fs_revision_root(&rev_root, fs, revnum, scratch_pool));
  SVN_ERR(svn_fs_paths_changed2(&paths_changed, rev_root, scratch_pool));

  /* Iterate them. */
  /* ### use iterpool? */
  for (hi = apr_hash_first(scratch_pool, paths_changed);
       hi; hi = apr_hash_next(hi))
    {
      const char *path;
      const svn_fs_path_change2_t *change;
      const svn_fs_id_t *node_rev_id1, *node_rev_id2;
      const svn_fs_id_t *the_id;

      node_revision_t *node_rev;

      path = svn__apr_hash_index_key(hi);
      change = svn__apr_hash_index_val(hi);
      if (! quiet)
        SVN_ERR(svn_cmdline_fprintf(stderr, scratch_pool,
                                    "processing r%ld:%s\n", revnum, path));

      if (change->change_kind == svn_fs_path_change_delete)
        /* Can't ask for reps of PATH at REVNUM if the path no longer exists
         * at that revision! */
        continue;

      /* Okay, we have two node_rev id's for this change: the txn one and
       * the revision one.  We'll use the latter. */
      node_rev_id1 = change->node_rev_id;
      SVN_ERR(svn_fs_node_id(&node_rev_id2, rev_root, path, scratch_pool));

      SVN_ERR_ASSERT(svn_fs_fs__id_txn_id(node_rev_id1) != NULL);
      SVN_ERR_ASSERT(svn_fs_fs__id_rev(node_rev_id2) != SVN_INVALID_REVNUM);

      the_id = node_rev_id2;

      /* Get the node_rev using the chosen node_rev_id. */
      SVN_ERR(svn_fs_fs__get_node_revision(&node_rev, fs, the_id, scratch_pool));

      /* Maybe record the sha1's. */
      SVN_ERR(record(prop_reps, node_rev->prop_rep, result_pool));
      SVN_ERR(record(data_reps, node_rev->data_rep, result_pool));
      SVN_ERR(record(both_reps, node_rev->prop_rep, result_pool));
      SVN_ERR(record(both_reps, node_rev->data_rep, result_pool));
    }

  return SVN_NO_ERROR;
}

/* Print REPS_REF_COUNT (a hash as for process_one_revision())
 * to stdout in "refcount => sha1" format.  A sha1 may appear
 * more than once if not all its instances are shared.  Prepend
 * each line by NAME.
 *
 * Use SCRATCH_POOL for temporary allocations.
 */
static svn_error_t *
pretty_print(const char *name,
             apr_hash_t *reps_ref_counts,
             apr_pool_t *scratch_pool)
{
  apr_hash_index_t *hi;

  if (reps_ref_counts == NULL)
    return SVN_NO_ERROR;

  for (hi = apr_hash_first(scratch_pool, reps_ref_counts);
       hi; hi = apr_hash_next(hi))
    {
      struct value_t *value;

      SVN_ERR(cancel_func(NULL));

      value = svn__apr_hash_index_val(hi);
      SVN_ERR(svn_cmdline_printf(scratch_pool, "%s %" APR_UINT64_T_FMT " %s\n",
                                 name, value->refcount,
                                 svn_checksum_to_cstring_display(
                                   value->sha1_checksum,
                                   scratch_pool)));
    }

  return SVN_NO_ERROR;
}

/* Return an error unless FS is an fsfs fs. */
static svn_error_t *is_fs_fsfs(svn_fs_t *fs, apr_pool_t *scratch_pool)
{
  const char *actual, *expected, *path;

  path = svn_fs_path(fs, scratch_pool);

  expected = SVN_FS_TYPE_FSFS;
  SVN_ERR(svn_fs_type(&actual, path, scratch_pool));

  if (strcmp(actual, expected) != 0)
    return svn_error_createf(SVN_ERR_FS_UNKNOWN_FS_TYPE, NULL,
                             "Filesystem '%s' is not of type '%s'",
                             svn_dirent_local_style(path, scratch_pool),
                             actual);

  return SVN_NO_ERROR;
}

/* The core logic.  This function iterates the repository REPOS_PATH
 * and sends all the (DATA and/or PROP) reps in each revision for counting
 * by process_one_revision().  QUIET is passed to process_one_revision().
 */
static svn_error_t *process(const char *repos_path,
                            svn_boolean_t prop,
                            svn_boolean_t data,
                            svn_boolean_t quiet,
                            apr_pool_t *scratch_pool)
{
  apr_hash_t *prop_reps = NULL;
  apr_hash_t *data_reps = NULL;
  apr_hash_t *both_reps = NULL;
  svn_revnum_t rev, youngest;
  apr_pool_t *iterpool;
  svn_repos_t *repos;
  svn_fs_t *fs;

  if (prop)
    prop_reps = apr_hash_make(scratch_pool);
  if (data)
    data_reps = apr_hash_make(scratch_pool);
  if (prop && data)
    both_reps = apr_hash_make(scratch_pool);

  /* Open the FS. */
  SVN_ERR(svn_repos_open2(&repos, repos_path, NULL, scratch_pool));
  fs = svn_repos_fs(repos);

  SVN_ERR(is_fs_fsfs(fs, scratch_pool));

  SVN_ERR(svn_fs_youngest_rev(&youngest, fs, scratch_pool));

  /* Iterate the revisions. */
  iterpool = svn_pool_create(scratch_pool);
  for (rev = 0; rev <= youngest; rev++)
    {
      svn_pool_clear(iterpool);
      SVN_ERR(cancel_func(NULL));
      SVN_ERR(process_one_revision(fs, rev, quiet,
                                   prop_reps, data_reps, both_reps,
                                   scratch_pool, iterpool));
    }
  svn_pool_destroy(iterpool);

  /* Print stats. */
  SVN_ERR(pretty_print("prop", prop_reps, scratch_pool));
  SVN_ERR(pretty_print("data", data_reps, scratch_pool));
  SVN_ERR(pretty_print("both", both_reps, scratch_pool));

  return SVN_NO_ERROR;
}

int
main(int argc, const char *argv[])
{
  const char *repos_path;
  apr_allocator_t *allocator;
  apr_pool_t *pool;
  svn_boolean_t prop = FALSE, data = FALSE;
  svn_boolean_t quiet = FALSE;
  svn_error_t *err;
  apr_getopt_t *os;
  const apr_getopt_option_t options[] =
    {
      {"data", OPT_DATA, 0, N_("display data reps stats")},
      {"prop", OPT_PROP, 0, N_("display prop reps stats")},
      {"both", OPT_BOTH, 0, N_("display combined (data+prop) reps stats")},
      {"quiet", 'q', 0, N_("no progress (only errors) to stderr")},
      {"help", 'h', 0, N_("display this help")},
      {"version", OPT_VERSION, 0,
       N_("show program version information")},
      {0,             0,  0,  0}
    };

  /* Initialize the app. */
  if (svn_cmdline_init("svn-rep-sharing-stats", stderr) != EXIT_SUCCESS)
    return EXIT_FAILURE;

  /* Create our top-level pool.  Use a separate mutexless allocator,
   * given this application is single threaded.
   */
  if (apr_allocator_create(&allocator))
    return EXIT_FAILURE;

  apr_allocator_max_free_set(allocator, SVN_ALLOCATOR_RECOMMENDED_MAX_FREE);

  pool = svn_pool_create_ex(NULL, allocator);
  apr_allocator_owner_set(allocator, pool);

  /* Check library versions */
  err = check_lib_versions();
  if (err)
    return svn_cmdline_handle_exit_error(err, pool, "svn-rep-sharing-stats: ");

  err = svn_cmdline__getopt_init(&os, argc, argv, pool);
  if (err)
    return svn_cmdline_handle_exit_error(err, pool, "svn-rep-sharing-stats: ");

  SVN_INT_ERR(check_experimental());

  os->interleave = 1;
  while (1)
    {
      int opt;
      const char *arg;
      apr_status_t status = apr_getopt_long(os, options, &opt, &arg);
      if (APR_STATUS_IS_EOF(status))
        break;
      if (status != APR_SUCCESS)
        {
          usage(pool);
          return EXIT_FAILURE;
        }
      switch (opt)
        {
        case OPT_DATA:
          data = TRUE;
          break;
        /* It seems we don't actually rep-share props yet. */
        case OPT_PROP:
          prop = TRUE;
          break;
        case OPT_BOTH:
          data = TRUE;
          prop = TRUE;
          break;
        case 'q':
          quiet = TRUE;
          break;
        case 'h':
          help(options, pool);
          break;
        case OPT_VERSION:
          SVN_INT_ERR(version(pool));
          exit(0);
          break;
        default:
          usage(pool);
          return EXIT_FAILURE;
        }
    }

  /* Exactly 1 non-option argument,
   * and at least one of "--data"/"--prop"/"--both".
   */
  if (os->ind + 1 != argc || (!data && !prop))
    {
      usage(pool);
      return EXIT_FAILURE;
    }

  /* Grab REPOS_PATH from argv. */
  SVN_INT_ERR(svn_utf_cstring_to_utf8(&repos_path, os->argv[os->ind], pool));
  repos_path = svn_dirent_internal_style(repos_path, pool);

  set_up_cancellation();

  /* Do something. */
  SVN_INT_ERR(process(repos_path, prop, data, quiet, pool));

  /* We're done. */

  svn_pool_destroy(pool);
  /* Flush stdout to make sure that the user will see any printing errors. */
  SVN_INT_ERR(svn_cmdline_fflush(stdout));

  return EXIT_SUCCESS;
}