summaryrefslogtreecommitdiffstats
path: root/misc-utils
diff options
context:
space:
mode:
authorKarel Zak2019-02-06 13:55:20 +0100
committerKarel Zak2019-02-06 13:55:20 +0100
commit117f4d8bf4fc3a21d9009222cbfad56f79b73c94 (patch)
tree2f4b086f9da6476a52b5623c79b39ec487f7d8df /misc-utils
parentmount: Do not call mnt_pretty_path() on net file systems. (diff)
parenthardlink: add first simple tests (diff)
downloadkernel-qcow2-util-linux-117f4d8bf4fc3a21d9009222cbfad56f79b73c94.tar.gz
kernel-qcow2-util-linux-117f4d8bf4fc3a21d9009222cbfad56f79b73c94.tar.xz
kernel-qcow2-util-linux-117f4d8bf4fc3a21d9009222cbfad56f79b73c94.zip
Merge branch 'hardlink' of https://github.com/rudimeier/util-linux into hardlink
* 'hardlink' of https://github.com/rudimeier/util-linux: (25 commits) hardlink: add first simple tests hardlink: util-linux usage hardlink: fix compiler warnings hardlink: style indentations and license header hardlink: enable build with and without pcre2 fixes for the fixes temporal fix before re-patch (updates from Fedora repo) Update hardlink.1 Fixed version number, added changelog about Todd Lewis' patch exclude files via pcre Fixed 32 bit build with gcc7 (RH Bugzilla ID 1422989) spec file reflects the atomic hardlinking patch; removed cleaning buildroot (redundant); update FSF address at .c source file Revert "spec file reflects the atomic hardlinking patch; removed cleaning buildroot (redundant); current FSF address at .c source file" spec file reflects the atomic hardlinking patch; removed cleaning buildroot (redundant); current FSF address at .c source file Mention -f option in the man page do not allow to hardlink files across filesystems by default (#786719) (use -f option to override) fix possible buffer overflows, integer overflows, update man page fix URL and remove mmap() (#676962, #672917) - update docs to describe highest verbosity -vv option (#210816) - use dist Resolves: 210816 mostly spec cleanup ...
Diffstat (limited to 'misc-utils')
-rw-r--r--misc-utils/Makemodule.am12
-rw-r--r--misc-utils/hardlink.162
-rw-r--r--misc-utils/hardlink.c534
3 files changed, 608 insertions, 0 deletions
diff --git a/misc-utils/Makemodule.am b/misc-utils/Makemodule.am
index 3043687e6..f56a819ac 100644
--- a/misc-utils/Makemodule.am
+++ b/misc-utils/Makemodule.am
@@ -216,3 +216,15 @@ fincore_SOURCES = misc-utils/fincore.c
fincore_LDADD = $(LDADD) libsmartcols.la libcommon.la
fincore_CFLAGS = $(AM_CFLAGS) -I$(ul_libsmartcols_incdir)
endif
+
+# hardlink: only built when the BUILD_HARDLINK conditional is true.  The PCRE
+# compiler flags and libraries are appended only under HAVE_PCRE.
+if BUILD_HARDLINK
+usrbin_exec_PROGRAMS += hardlink
+hardlink_SOURCES = misc-utils/hardlink.c
+hardlink_LDADD = $(LDADD) libcommon.la
+hardlink_CFLAGS = $(AM_CFLAGS)
+if HAVE_PCRE
+hardlink_LDADD += $(PCRE_LIBS)
+hardlink_CFLAGS += $(PCRE_CFLAGS)
+endif
+dist_man_MANS += misc-utils/hardlink.1
+endif
diff --git a/misc-utils/hardlink.1 b/misc-utils/hardlink.1
new file mode 100644
index 000000000..5aa022a1f
--- /dev/null
+++ b/misc-utils/hardlink.1
@@ -0,0 +1,62 @@
+.TH "hardlink" "1"
+.SH "NAME"
+hardlink \- Consolidate duplicate files via hardlinks
+.SH "SYNOPSIS"
+.PP
+\fBhardlink\fP [\fB-c\fP] [\fB-f\fP] [\fB-n\fP] [\fB-v\fP] [\fB-vv\fP] [\fB-x pattern\fP] [\fB-h\fP] directory1 [ directory2 ... ]
+.SH "DESCRIPTION"
+.PP
+This manual page documents \fBhardlink\fP, a
+program which consolidates duplicate files in one or more directories
+using hardlinks.
+.PP
+\fBhardlink\fP traverses one
+or more directories searching for duplicate files. When it finds duplicate
+files, it uses one of them as the master. It then removes all other
+duplicates and places a hardlink for each one pointing to the master file.
+This allows for conservation of disk space where multiple directories
+on a single filesystem contain many duplicate files.
+.PP
+Since hard links can only span a single filesystem, \fBhardlink\fP
+is only useful when all directories specified are on the same filesystem.
+.SH "OPTIONS"
+.PP
+.IP "\fB-c\fP" 10
+Compare only the contents of the files being considered for consolidation.
+Disregards permission, ownership and other differences.
+.IP "\fB-f\fP" 10
+Force hardlinking across filesystems.
+.IP "\fB-n\fP" 10
+Do not perform the consolidation; only print what would be changed.
+.IP "\fB-v\fP" 10
+Print summary after hardlinking.
+.IP "\fB-vv\fP" 10
+Print every hardlinked file and bytes saved. Also print summary after hardlinking.
+.IP "\fB-x pattern\fP" 10
+Exclude files and directories matching pattern from hardlinking.
+.IP "\fB-h\fP" 10
+Show help.
+.PP
+The optional pattern for excluding files and directories must be a PCRE2
+compatible regular expression. Only the basename of the file or directory
+is checked, not its path. Excluded directories' contents will not be examined.
+.SH "AUTHOR"
+.PP
+\fBhardlink\fP was written by Jakub Jelinek <jakub@redhat.com>.
+.PP
+Man page written by Brian Long.
+.PP
+Man page updated by Jindrich Novy <jnovy@redhat.com>.
+.SH "BUGS"
+.PP
+\fBhardlink\fP assumes that its target directory trees do not change from under
+it. If a directory tree does change, this may result in \fBhardlink\fP
+accessing files and/or directories outside of the intended directory tree.
+Thus, you must avoid running \fBhardlink\fP on potentially changing directory
+trees, and especially on directory trees under control of another user.
+.PP
+Historically \fBhardlink\fP silently excluded any names beginning with
+".in.", as well as any names beginning with "." followed by exactly 6
+other characters. That prior behavior can be achieved by specifying
+.br
+-x '^(\\.in\\.|\\.[^.]{6}$)'
diff --git a/misc-utils/hardlink.c b/misc-utils/hardlink.c
new file mode 100644
index 000000000..56edaa686
--- /dev/null
+++ b/misc-utils/hardlink.c
@@ -0,0 +1,534 @@
+/*
+ * hardlink - consolidate duplicate files via hardlinks
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
+ * Written by Jakub Jelinek <jakub@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <errno.h>
+#ifdef HAVE_PCRE
+# define PCRE2_CODE_UNIT_WIDTH 8
+# include <pcre2.h>
+#endif
+
+#include "c.h"
+#include "nls.h"
+#include "closestream.h"
+
+/* Number of slots in hps[]; hash() masks its result with NHASH - 1. */
+#define NHASH (1<<17) /* Must be a power of 2! */
+/* Size in bytes of each content-comparison buffer (iobuf1, iobuf2). */
+#define NIOBUF (1<<12)
+/* Names longer than this are not blanked out of the verbose progress line. */
+#define NAMELEN 4096
+/* Number of unsigned ints in buf[], the leading part of a file that is summed. */
+#define NBUF 64
+
+#ifdef HAVE_PCRE
+/* Compiled form of the -x pattern; stays NULL when -x was not given. */
+pcre2_code *re;
+/* Raw -x argument exactly as passed on the command line. */
+PCRE2_SPTR exclude_pattern;
+/* Match block handed to pcre2_match() for every directory entry. */
+pcre2_match_data *match_data;
+#endif
+
+struct _f;
+/*
+ * One entry per distinct (size, mtime) pair seen so far.  Entries that land
+ * in the same slot of hps[] are linked through `next`.
+ */
+typedef struct _h {
+ struct _h *next; /* next entry in the same hps[] slot */
+ struct _f *chain; /* files recorded with this size and mtime */
+ off_t size; /* st_size of the files on the chain */
+ time_t mtime; /* st_mtime, or 0 when content_only is set */
+} h;
+
+/*
+ * A directory waiting to be scanned.  Nodes form a singly linked stack whose
+ * head is `dirs`.  The path is stored inline, so a node is allocated as
+ * sizeof(d) + strlen(path) + 1 bytes.
+ */
+typedef struct _d {
+	struct _d *next;	/* next pending directory, or NULL */
+	char name[];		/* NUL-terminated path (C99 flexible array member) */
+} d;
+
+/* Stack of directories still to be scanned: rf() pushes, main() pops. */
+d *dirs;
+
+/* Table of (size, mtime) entries, indexed by hash(). */
+h *hps[NHASH];
+
+/* Command-line switches, in order: -n, -v (counted), -c, -f. */
+int no_link = 0;
+int verbose = 0;
+int content_only = 0;
+int force = 0;
+
+/*
+ * A regular file that has been seen and may serve as a link source later.
+ * Nodes hang off h.chain.  The path is stored inline, so a node is allocated
+ * as sizeof(f) + strlen(path) + 1 bytes.
+ */
+typedef struct _f {
+	struct _f *next;	/* next file with the same size and mtime */
+	ino_t ino;		/* inode number (st_ino) */
+	dev_t dev;		/* device (st_dev) */
+	unsigned int cksum;	/* sum over the leading bytes, computed in rf() */
+	char name[];		/* NUL-terminated path (C99 flexible array member) */
+} f;
+
+/*
+ * Choose the hps[] slot for a file from its size and modification time.
+ * The mask keeps the result below NHASH, which is why NHASH has to be a
+ * power of two.
+ */
+__attribute__ ((always_inline))
+static inline unsigned int hash(off_t size, time_t mtime)
+{
+	const unsigned int slot = (size ^ mtime) & (NHASH - 1);
+
+	return slot;
+}
+
+/*
+ * Decide whether two files differ in the attributes that matter for linking.
+ *
+ * With content_only set only the sizes are compared; otherwise mode, owner,
+ * group, size and mtime all have to agree.
+ *
+ * Returns 0 when the files are link candidates, 1 when they differ.
+ */
+__attribute__ ((always_inline))
+static inline int stcmp(struct stat *st1, struct stat *st2, int content_only)
+{
+	const int size_differs = st1->st_size != st2->st_size;
+
+	if (content_only)
+		return size_differs;
+
+	if (st1->st_mode != st2->st_mode)
+		return 1;
+	if (st1->st_uid != st2->st_uid)
+		return 1;
+	if (st1->st_gid != st2->st_gid)
+		return 1;
+	if (size_differs)
+		return 1;
+	return st1->st_mtime != st2->st_mtime;
+}
+
+/*
+ * Statistics printed by doexit() in verbose mode: directories opened, objects
+ * passed to rf(), regular files seen, content comparisons started, links made
+ * (or that would be made with -n) and bytes saved, where each file counts as
+ * its size rounded up to a multiple of 4096.
+ */
+long long ndirs, nobjects, nregfiles, ncomp, nlinks, nsaved;
+
+/*
+ * Terminate the program with exit status i.  In verbose mode the run
+ * statistics are written to stderr first; the wording of the last two lines
+ * depends on whether -n (no_link) is in effect.
+ */
+static void doexit(int i)
+{
+	const char *link_label = no_link ? "Would link" : "Linked";
+	const char *save_label = no_link ? "Would save" : "saved";
+
+	if (!verbose)
+		exit(i);
+
+	fputs("\n\n", stderr);
+	fprintf(stderr, "Directories %lld\n", ndirs);
+	fprintf(stderr, "Objects %lld\n", nobjects);
+	fprintf(stderr, "IFREG %lld\n", nregfiles);
+	fprintf(stderr, "Comparisons %lld\n", ncomp);
+	fprintf(stderr, "%s %lld\n", link_label, nlinks);
+	fprintf(stderr, "%s %lld\n", save_label, nsaved);
+	exit(i);
+}
+
+/*
+ * Write the help text to stdout and exit with EXIT_SUCCESS.
+ */
+static void __attribute__((__noreturn__)) usage(void)
+{
+	FILE *out = stdout;
+
+	fputs(USAGE_HEADER, out);
+	fprintf(out, _(" %s [options] directory...\n"), program_invocation_short_name);
+
+	fputs(USAGE_SEPARATOR, out);
+	fprintf(out, "%s\n", _("Consolidate duplicate files using hardlinks."));
+
+	fputs(USAGE_OPTIONS, out);
+	fprintf(out, "%s\n", _(" -c when finding candidates for linking, compare only file contents"));
+	fprintf(out, "%s\n", _(" -n don't actually link anything, just report what would be done"));
+	fprintf(out, "%s\n", _(" -v print summary after hardlinking"));
+	fprintf(out, "%s\n", _(" -vv print every hardlinked file and bytes saved + summary"));
+	fprintf(out, "%s\n", _(" -f force hardlinking across filesystems"));
+	fprintf(out, "%s\n", _(" -x <regex> exclude files matching pattern"));
+
+	fputs(USAGE_SEPARATOR, out);
+	fprintf(out, USAGE_HELP_OPTIONS(16)); /* char offset to align option descriptions */
+	fprintf(out, USAGE_MAN_TAIL("hardlink(1)"));
+	exit(EXIT_SUCCESS);
+}
+
+
+/* Holds the leading bytes of a file while rf() sums them word by word. */
+unsigned int buf[NBUF];
+/* Read buffers for the byte-wise comparison of two candidate files in rf(). */
+char iobuf1[NIOBUF], iobuf2[NIOBUF];
+
+/*
+ * Add two sizes.  If the sum wraps around, a message is written to stderr
+ * and the program leaves through doexit(5).
+ */
+__attribute__ ((always_inline))
+static inline size_t add2(size_t a, size_t b)
+{
+	const size_t total = a + b;
+
+	/* unsigned addition wrapped exactly when the result is below an operand */
+	if (total >= a)
+		return total;
+
+	fprintf(stderr, "\nInteger overflow\n");
+	doexit(5);
+	return total;
+}
+
+/*
+ * Add three sizes with the same overflow handling as add2().
+ */
+__attribute__ ((always_inline))
+static inline size_t add3(size_t a, size_t b, size_t c)
+{
+	const size_t first_two = add2(a, b);
+
+	return add2(first_two, c);
+}
+
+/* Growable heap string; enlarged on demand by growstr(). */
+typedef struct {
+ char *buf; /* heap storage; users start it out as NULL */
+ size_t alloc; /* number of bytes currently allocated for buf */
+} dynstr;
+
+/*
+ * Make sure str->buf has room for a string of newlen characters plus its
+ * terminating NUL.  The buffer is only ever enlarged, to exactly newlen + 1
+ * bytes.  If the allocation fails the program leaves through doexit(4).
+ */
+static void growstr(dynstr * str, size_t newlen)
+{
+	if (newlen >= str->alloc) {
+		str->alloc = add2(newlen, 1);
+		str->buf = realloc(str->buf, str->alloc);
+		if (str->buf == NULL) {
+			fprintf(stderr, "\nOut of memory 4\n");
+			doexit(4);
+		}
+	}
+}
+
+/* Device of the first object examined, 0 until then.  Only maintained without -f. */
+dev_t dev = 0;
+/*
+ * Examine one path.
+ *
+ * A directory is pushed onto `dirs` so that main() scans it later.  A regular
+ * file is looked up among the files already recorded with the same size (and
+ * mtime, unless -c) and the same leading checksum.  If a byte-identical file
+ * on the same device is found, `name` is replaced by a hard link to it, or
+ * the link is only reported when -n is in effect.  Otherwise the file is
+ * recorded as a possible link source for later paths.
+ *
+ * Any other object type is ignored, and so are paths that cannot be
+ * lstat()ed or opened.
+ */
+static void rf(const char *name)
+{
+ struct stat st, st2, st3;
+ const size_t namelen = strlen(name);
+ nobjects++;
+ /* lstat(): a symbolic link is looked at itself, not its target */
+ if (lstat(name, &st))
+ return;
+ /* Without -f every object has to live on the device of the first one. */
+ if (st.st_dev != dev && !force) {
+ if (dev) {
+ fprintf(stderr,
+ "%s is on different filesystem than the rest.\nUse -f option to override.\n",
+ name);
+ doexit(6);
+ }
+ dev = st.st_dev;
+ }
+ if (S_ISDIR(st.st_mode)) {
+ /* Queue the directory; the node stores a copy of the path inline. */
+ d *dp = malloc(add3(sizeof(d), namelen, 1));
+ if (!dp) {
+ fprintf(stderr, "\nOut of memory 3\n");
+ doexit(3);
+ }
+ memcpy(dp->name, name, namelen + 1);
+ dp->next = dirs;
+ dirs = dp;
+ } else if (S_ISREG(st.st_mode)) {
+ int fd, i;
+ f *fp, *fp2;
+ h *hp;
+ const char *n1, *n2;
+ int cksumsize = sizeof(buf);
+ unsigned int cksum;
+ /* With -c the mtime takes no part in grouping files. */
+ time_t mtime = content_only ? 0 : st.st_mtime;
+ unsigned int hsh = hash(st.st_size, mtime);
+ off_t fsize;
+ nregfiles++;
+ /* Progress output; blanked again with "\r%*s\r" on the paths below. */
+ if (verbose > 1)
+ fprintf(stderr, " %s", name);
+ fd = open(name, O_RDONLY);
+ if (fd < 0)
+ return;
+ /*
+ * File shorter than buf: read only st_size bytes and zero the rest of
+ * the last partial word, so the word-wise sum below reads defined data.
+ */
+ if ((size_t)st.st_size < sizeof(buf)) {
+ cksumsize = st.st_size;
+ memset(((char *)buf) + cksumsize, 0,
+ (sizeof(buf) - cksumsize) % sizeof(buf[0]));
+ }
+ /* A failed or short read makes the file be skipped. */
+ if (read(fd, buf, cksumsize) != cksumsize) {
+ close(fd);
+ if (verbose > 1 && namelen <= NAMELEN)
+ fprintf(stderr, "\r%*s\r", (int)(namelen + 2),
+ "");
+ return;
+ }
+ /* From here on cksumsize counts words of buf, not bytes. */
+ cksumsize = (cksumsize + sizeof(buf[0]) - 1) / sizeof(buf[0]);
+ /* Sum the words; whenever the addition wraps, one more is added. */
+ for (i = 0, cksum = 0; i < cksumsize; i++) {
+ if (cksum + buf[i] < cksum)
+ cksum += buf[i] + 1;
+ else
+ cksum += buf[i];
+ }
+ /* Find the entry for this (size, mtime), creating it if necessary. */
+ for (hp = hps[hsh]; hp; hp = hp->next)
+ if (hp->size == st.st_size && hp->mtime == mtime)
+ break;
+ if (!hp) {
+ hp = malloc(sizeof(h));
+ if (!hp) {
+ fprintf(stderr, "\nOut of memory 1\n");
+ doexit(1);
+ }
+ hp->size = st.st_size;
+ hp->mtime = mtime;
+ hp->chain = NULL;
+ hp->next = hps[hsh];
+ hps[hsh] = hp;
+ }
+ /*
+ * fp: first recorded file with this checksum.  Files with equal
+ * checksums sit next to each other (see the insertion at the end of
+ * this branch), so the two loops below stop at the first mismatch.
+ */
+ for (fp = hp->chain; fp; fp = fp->next)
+ if (fp->cksum == cksum)
+ break;
+ /* Pass 1: `name` is already a link to a recorded file - nothing to do. */
+ for (fp2 = fp; fp2 && fp2->cksum == cksum; fp2 = fp2->next)
+ if (fp2->ino == st.st_ino && fp2->dev == st.st_dev) {
+ close(fd);
+ if (verbose > 1 && namelen <= NAMELEN)
+ fprintf(stderr, "\r%*s\r",
+ (int)(namelen + 2), "");
+ return;
+ }
+ /*
+ * Pass 2: try each recorded file with this checksum as link source.
+ * It must still be a regular file on the same device, be a different
+ * inode and agree with `name` according to stcmp().
+ */
+ for (fp2 = fp; fp2 && fp2->cksum == cksum; fp2 = fp2->next)
+ if (!lstat(fp2->name, &st2) && S_ISREG(st2.st_mode) &&
+ !stcmp(&st, &st2, content_only) &&
+ st2.st_ino != st.st_ino &&
+ st2.st_dev == st.st_dev) {
+ int fd2 = open(fp2->name, O_RDONLY);
+ if (fd2 < 0)
+ continue;
+ /* Re-check through the descriptor; empty files are never linked. */
+ if (fstat(fd2, &st2) || !S_ISREG(st2.st_mode)
+ || st2.st_size == 0) {
+ close(fd2);
+ continue;
+ }
+ ncomp++;
+ /* fd was already read for the checksum (and earlier candidates). */
+ lseek(fd, 0, SEEK_SET);
+ /* Compare both files from the start, at most NIOBUF bytes at a time. */
+ for (fsize = st.st_size; fsize > 0;
+ fsize -= NIOBUF) {
+ off_t rsize =
+ fsize >= NIOBUF ? NIOBUF : fsize;
+ if (read(fd, iobuf1, rsize) != rsize
+ || read(fd2, iobuf2,
+ rsize) != rsize) {
+ close(fd);
+ close(fd2);
+ fprintf(stderr,
+ "\nReading error\n");
+ return;
+ }
+ if (memcmp(iobuf1, iobuf2, rsize))
+ break;
+ }
+ close(fd2);
+ /* Loop left early: contents differ, try the next candidate. */
+ if (fsize > 0)
+ continue;
+ /* Make sure `name` was not modified while it was being compared. */
+ if (lstat(name, &st3)) {
+ fprintf(stderr,
+ "\nCould not stat %s again\n",
+ name);
+ close(fd);
+ return;
+ }
+ /* NOTE(review): stcmp() never looks at st_atime, so this assignment looks redundant - confirm. */
+ st3.st_atime = st.st_atime;
+ if (stcmp(&st, &st3, 0)) {
+ fprintf(stderr,
+ "\nFile %s changed underneath us\n",
+ name);
+ close(fd);
+ return;
+ }
+ /* n1: existing file to link from, n2: path to be replaced */
+ n1 = fp2->name;
+ n2 = name;
+ if (!no_link) {
+ const char *suffix =
+ ".$$$___cleanit___$$$";
+ const size_t suffixlen = strlen(suffix);
+ size_t n2len = strlen(n2);
+ dynstr nam2 = { NULL, 0 };
+ /* nam2 = n2 followed by the suffix (the NUL is copied with it) */
+ growstr(&nam2, add2(n2len, suffixlen));
+ memcpy(nam2.buf, n2, n2len);
+ memcpy(&nam2.buf[n2len], suffix,
+ suffixlen + 1);
+ /* First create a temporary link to n1 under a new name */
+ if (link(n1, nam2.buf)) {
+ fprintf(stderr,
+ "\nFailed to hardlink %s to %s (create temporary link as %s failed - %s)\n",
+ n1, n2, nam2.buf,
+ strerror(errno));
+ free(nam2.buf);
+ continue;
+ }
+ /* Then rename into place over the existing n2 */
+ if (rename(nam2.buf, n2)) {
+ fprintf(stderr,
+ "\nFailed to hardlink %s to %s (rename temporary link to %s failed - %s)\n",
+ n1, n2, n2,
+ strerror(errno));
+ /* Something went wrong, try to remove the now redundant temporary link */
+ if (unlink(nam2.buf)) {
+ fprintf(stderr,
+ "\nFailed to remove temporary link %s - %s\n",
+ nam2.buf,
+ strerror
+ (errno));
+ }
+ free(nam2.buf);
+ continue;
+ }
+ free(nam2.buf);
+ }
+ nlinks++;
+ if (st3.st_nlink > 1) {
+ /* We actually did not save anything this time, since the link second argument
+ had some other links as well. */
+ if (verbose > 1)
+ fprintf(stderr,
+ "\r%*s\r%s %s to %s\n",
+ (int)(((namelen >
+ NAMELEN) ? 0 :
+ namelen) + 2),
+ "",
+ (no_link ? "Would link"
+ : "Linked"), n1, n2);
+ } else {
+ /* Count the size rounded up to a multiple of 4096 bytes. */
+ nsaved +=
+ ((st.st_size + 4095) / 4096) * 4096;
+ if (verbose > 1)
+ fprintf(stderr,
+ "\r%*s\r%s %s to %s, %s %jd\n",
+ (int)(((namelen >
+ NAMELEN) ? 0 :
+ namelen) + 2),
+ "",
+ (no_link ? "Would link"
+ : "Linked"), n1, n2,
+ (no_link ? "would save"
+ : "saved"),
+ (intmax_t)st.st_size);
+ }
+ close(fd);
+ return;
+ }
+ /* No usable duplicate: record this file as a link source for later paths. */
+ fp2 = malloc(add3(sizeof(f), namelen, 1));
+ if (!fp2) {
+ fprintf(stderr, "\nOut of memory 2\n");
+ doexit(2);
+ }
+ close(fd);
+ fp2->ino = st.st_ino;
+ fp2->dev = st.st_dev;
+ fp2->cksum = cksum;
+ memcpy(fp2->name, name, namelen + 1);
+ /*
+ * Keep equal checksums adjacent: insert right after the first file with
+ * this checksum, or at the head of the chain when there is none yet.
+ */
+ if (fp) {
+ fp2->next = fp->next;
+ fp->next = fp2;
+ } else {
+ fp2->next = hp->chain;
+ hp->chain = fp2;
+ }
+ if (verbose > 1 && namelen <= NAMELEN)
+ fprintf(stderr, "\r%*s\r", (int)(namelen + 2), "");
+ return;
+ }
+}
+
+/*
+ * hardlink entry point.
+ *
+ * Parses the options, hands every command-line path to rf() and then drains
+ * the `dirs` stack: each queued directory is opened and every entry except
+ * "." and ".." (and, with -x, every name matching the pattern) is passed to
+ * rf(), which may queue further directories.  A directory that cannot be
+ * opened is skipped silently.
+ *
+ * On normal completion the program leaves through doexit(0), which prints
+ * the statistics when -v was given.
+ */
+int main(int argc, char **argv)
+{
+	int ch;
+	int i;
+#ifdef HAVE_PCRE
+	int errornumber;
+	PCRE2_SIZE erroroffset;
+#endif
+	/* path buffer reused for every directory: "<dir>/" + entry name */
+	dynstr nam1 = { NULL, 0 };
+
+	static const struct option longopts[] = {
+		{ "version", no_argument, NULL, 'V' },
+		{ "help", no_argument, NULL, 'h' },
+		{ NULL, 0, NULL, 0 },
+	};
+
+	setlocale(LC_ALL, "");
+	bindtextdomain(PACKAGE, LOCALEDIR);
+	textdomain(PACKAGE);
+	atexit(close_stdout);
+
+	while ((ch = getopt_long(argc, argv, "cnvfx:Vh", longopts, NULL)) != -1) {
+		switch (ch) {
+		case 'n':
+			no_link++;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case 'c':
+			content_only++;
+			break;
+		case 'f':
+			force = 1;
+			break;
+		case 'x':
+#ifdef HAVE_PCRE
+			exclude_pattern = (PCRE2_SPTR) optarg;
+#else
+			/* errx() terminates the program, nothing is needed after it */
+			errx(EXIT_FAILURE,
+			     _("option -x not supported (built without pcre2)"));
+#endif
+			break;
+		case 'V':
+			printf(UTIL_LINUX_VERSION);
+			return EXIT_SUCCESS;
+		case 'h':
+			usage();
+		default:
+			errtryhelp(EXIT_FAILURE);
+		}
+	}
+
+	if (optind == argc) {
+		warnx(_("no directory specified"));
+		errtryhelp(EXIT_FAILURE);
+	}
+
+#ifdef HAVE_PCRE
+	if (exclude_pattern) {
+		re = pcre2_compile(exclude_pattern,	/* the pattern */
+				   PCRE2_ZERO_TERMINATED,	/* pattern is zero-terminated */
+				   0,	/* default options */
+				   &errornumber, &erroroffset,
+				   NULL);	/* use default compile context */
+		if (!re) {
+			PCRE2_UCHAR buffer[256];
+
+			pcre2_get_error_message(errornumber, buffer,
+						sizeof(buffer));
+			errx(EXIT_FAILURE, _("pattern error at offset %d: %s"),
+			     (int)erroroffset, buffer);
+		}
+		match_data = pcre2_match_data_create_from_pattern(re, NULL);
+		/*
+		 * Without a match block pcre2_match() fails for every name, which
+		 * would silently turn -x into a no-op.  Refuse to continue instead.
+		 */
+		if (!match_data)
+			errx(EXIT_FAILURE, _("cannot allocate pcre2 match data"));
+	}
+#endif
+	/* Seed the scan: directories get queued, plain files are handled at once. */
+	for (i = optind; i < argc; i++)
+		rf(argv[i]);
+
+	while (dirs) {
+		DIR *dh;
+		struct dirent *di;
+		d *dp = dirs;
+		size_t nam1baselen = strlen(dp->name);
+
+		/* Pop the directory and build "<dir>/" in nam1. */
+		dirs = dp->next;
+		growstr(&nam1, add2(nam1baselen, 1));
+		memcpy(nam1.buf, dp->name, nam1baselen);
+		free(dp);
+		nam1.buf[nam1baselen++] = '/';
+		nam1.buf[nam1baselen] = 0;
+		dh = opendir(nam1.buf);
+		if (dh == NULL)
+			continue;
+		ndirs++;
+		while ((di = readdir(dh)) != NULL) {
+			if (!di->d_name[0])
+				continue;
+			/* skip "." and ".." */
+			if (di->d_name[0] == '.') {
+				if (!di->d_name[1] || !strcmp(di->d_name, ".."))
+					continue;
+			}
+#ifdef HAVE_PCRE
+			/* Only the entry name is matched, not the full path. */
+			if (re && pcre2_match(re,	/* compiled regex */
+					      (PCRE2_SPTR) di->d_name,
+					      strlen(di->d_name),
+					      0,	/* start at offset 0 */
+					      0,	/* default options */
+					      match_data,	/* block for storing the result */
+					      NULL)	/* use default match context */
+			    >= 0) {
+				if (verbose) {
+					/* cut nam1 back to "<dir>/" before printing */
+					nam1.buf[nam1baselen] = 0;
+					fprintf(stderr, "Skipping %s%s\n",
+						nam1.buf, di->d_name);
+				}
+				continue;
+			}
+#endif
+			{
+				/* Append the entry name, including its NUL, after "<dir>/". */
+				size_t subdirlen = strlen(di->d_name);
+
+				growstr(&nam1, add2(nam1baselen, subdirlen));
+				memcpy(&nam1.buf[nam1baselen], di->d_name,
+				       add2(subdirlen, 1));
+			}
+			rf(nam1.buf);
+		}
+		closedir(dh);
+	}
+	doexit(0);
+	return 0;
+}