containers-storage: new command 'dedup'

Introduce a new `dedup` command to the `containers-storage` tool to
deduplicate similar files in image layers.  Reflink support from the
underlying file system is needed.

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
Giuseppe Scrivano 2024-11-21 15:05:00 +01:00
parent f99d1f662b
commit 179437f65c
3 changed files with 152 additions and 0 deletions

View File

@ -0,0 +1,56 @@
package main
import (
"fmt"
"github.com/containers/storage"
"github.com/containers/storage/pkg/mflag"
)
// hashMethodArg is the name of the hash method selected with the
// --hash-method flag; dedupCmd accepts "crc", "size" and "sha256".
// It defaults to "crc".
var hashMethodArg = "crc"
// dedupCmd implements the "dedup" subcommand: it calls Dedup on the store and
// reports how many bytes were deduplicated.
//
// The hash method comes from the package-level hashMethodArg (set by the
// --hash-method flag) and must be one of "crc", "size" or "sha256".  The
// flags and args parameters are not used.
//
// It returns an integer status (presumably used as the exit code) and an
// error:
//   - (1, err) when the hash method is unknown;
//   - (1, err) when Dedup fails and JSON output is off, with err wrapping the
//     underlying error so callers can inspect it with errors.Is/errors.As;
//   - (1, err2) when Dedup fails and JSON output is on, where err2 is the
//     result of writing the error as JSON;
//   - the result of outputJSON(res) when JSON output is on and Dedup succeeds;
//   - (0, nil) after printing the human-readable summary.
func dedupCmd(flags *mflag.FlagSet, action string, m storage.Store, args []string) (int, error) {
	var hashMethod storage.DedupHashMethod
	switch hashMethodArg {
	case "crc":
		hashMethod = storage.DedupHashCRC
	case "size":
		hashMethod = storage.DedupHashFileSize
	case "sha256":
		hashMethod = storage.DedupHashSHA256
	default:
		// %q keeps an empty or whitespace-only value visible in the message.
		return 1, fmt.Errorf("invalid hash method: %q", hashMethodArg)
	}

	res, err := m.Dedup(storage.DedupArgs{
		Options: storage.DedupOptions{
			HashMethod: hashMethod,
		},
	})
	if err != nil {
		if jsonOutput {
			// The Dedup error is emitted as JSON; only a failure to emit it
			// is returned to the caller.
			_, err2 := outputJSON(err)
			return 1, err2 // Note that err2 is usually nil
		}
		return 1, fmt.Errorf("%s: %w", action, err)
	}

	if jsonOutput {
		return outputJSON(res)
	}
	fmt.Printf("Deduplicated %v bytes\n", res.Deduped)
	return 0, nil
}
func init() {
commands = append(commands, command{
names: []string{"dedup"},
usage: "Dedup all images",
minArgs: 0,
maxArgs: 0,
action: dedupCmd,
addFlags: func(flags *mflag.FlagSet, cmd *command) {
flags.BoolVar(&jsonOutput, []string{"-json", "j"}, jsonOutput, "Prefer JSON output")
flags.StringVar(&hashMethodArg, []string{"-hash-method"}, hashMethodArg, "Specify the hash function to use to detect identical files")
},
})
}

View File

@ -0,0 +1,18 @@
## containers-storage-dedup 1 "November 2024"
## NAME
containers-storage dedup - Deduplicate similar files in the images
## SYNOPSIS
**containers-storage** **dedup**
## DESCRIPTION
Find similar files in the images and deduplicate them. It requires reflink support from the file system.
## OPTIONS
**--hash-method** *method*
Specify the function to use to calculate the hash for a file. It can be one of: *size*, *crc*, *sha256*. The default is *crc*.
## EXAMPLE
**containers-storage dedup**

78
storage/tests/dedup.bats Normal file
View File

@ -0,0 +1,78 @@
#!/usr/bin/env bats
load helpers
# Checks that "dedup" reports a non-zero number of deduplicated bytes for
# images whose layers share content, and that repeated runs with every
# supported hash method keep reporting that same number.
@test "dedup" {
	case "$STORAGE_DRIVER" in
	overlay*|vfs)
		;;
	*)
		skip "driver $STORAGE_DRIVER does not support dedup"
		;;
	esac
	if ! command -v tar > /dev/null 2>&1 ; then
		skip "need tar"
	fi
	if ! command -v jq > /dev/null 2>&1 ; then
		skip "need jq"
	fi

	echo some content > "$TESTDIR/from"
	# Skip the test if the underlying file system does not support reflinks.
	if ! cp --reflink=always "$TESTDIR/from" "$TESTDIR/to"; then
		skip "need reflink support"
	fi

	populate

	# Save the layer diffs so the layers can be recreated below.
	storage diff -u -f "$TESTDIR/lower.tar" "$lowerlayer"
	storage diff -c -f "$TESTDIR/middle.tar" "$midlayer"
	storage diff -u -f "$TESTDIR/upper.tar" "$upperlayer"

	# Delete the layers.
	storage delete-layer "$upperlayer"
	storage delete-layer "$midlayer"
	storage delete-layer "$lowerlayer"

	# Create new layers and populate them using the layer diffs.
	run storage --debug=false create-layer
	[ "$status" -eq 0 ]
	[ "$output" != "" ]
	lowerlayer="$output"
	storage applydiff -f "$TESTDIR/lower.tar" "$lowerlayer"

	run storage --debug=false create-layer "$lowerlayer"
	[ "$status" -eq 0 ]
	[ "$output" != "" ]
	midlayer="$output"
	storage applydiff -f "$TESTDIR/middle.tar" "$midlayer"

	run storage --debug=false create-layer "$midlayer"
	[ "$status" -eq 0 ]
	[ "$output" != "" ]
	upperlayer="$output"
	# The lower diff is applied again on top so the upper layer carries
	# the same file contents as the lower layer.
	storage applydiff -f "$TESTDIR/lower.tar" "$upperlayer"
	storage applydiff -f "$TESTDIR/upper.tar" "$upperlayer"

	for layer in "$lowerlayer" "$midlayer" "$upperlayer"; do
		run storage --debug=false create-image "$layer"
		[ "$status" -eq 0 ]
	done

	run storage --debug=false dedup -j
	[ "$status" -eq 0 ]
	deduped=$(jq -r .Deduped <<< "$output")
	[[ "$deduped" -gt 0 ]]

	for METHOD in size crc sha256; do
		# Test that it always returns the same value with any hash-method.
		for i in $(seq 10); do
			run storage --debug=false dedup -j --hash-method="$METHOD"
			[ "$status" -eq 0 ]
			actual=$(jq -r .Deduped <<< "$output")
			# Quote the right-hand side so it is compared literally
			# rather than treated as a glob pattern.
			[[ "$deduped" = "$actual" ]]
		done
	done
}