diff --git a/storage/cmd/containers-storage/dedup.go b/storage/cmd/containers-storage/dedup.go
new file mode 100644
index 0000000000..43f57d8654
--- /dev/null
+++ b/storage/cmd/containers-storage/dedup.go
@@ -0,0 +1,66 @@
+package main
+
+import (
+	"fmt"
+
+	"github.com/containers/storage"
+	"github.com/containers/storage/pkg/mflag"
+)
+
+// hashMethodArg holds the value of the --hash-method flag; "crc" is the default.
+var hashMethodArg = "crc"
+
+// dedupCmd implements the "dedup" subcommand: it maps hashMethodArg to a
+// storage.DedupHashMethod, calls m.Dedup and reports the result.
+//
+// It returns the command's status code together with an error. An unknown
+// hash method or a failed Dedup call yields status 1. When jsonOutput is set,
+// both the result and a Dedup failure are passed to outputJSON; otherwise
+// the number of deduplicated bytes is printed to standard output.
+func dedupCmd(flags *mflag.FlagSet, action string, m storage.Store, args []string) (int, error) {
+	var hashMethod storage.DedupHashMethod
+	switch hashMethodArg {
+	case "crc":
+		hashMethod = storage.DedupHashCRC
+	case "size":
+		hashMethod = storage.DedupHashFileSize
+	case "sha256":
+		hashMethod = storage.DedupHashSHA256
+	default:
+		return 1, fmt.Errorf("invalid hash method: %s", hashMethodArg)
+	}
+	res, err := m.Dedup(storage.DedupArgs{
+		Options: storage.DedupOptions{
+			HashMethod: hashMethod,
+		},
+	})
+	if err != nil {
+		if jsonOutput {
+			// Report the Dedup failure as JSON; the error returned to the
+			// caller is whatever outputJSON itself produced.
+			_, err2 := outputJSON(err)
+			return 1, err2 // Note that err2 is usually nil
+		}
+		return 1, fmt.Errorf("%s: %+v", action, err)
+	}
+	if jsonOutput {
+		return outputJSON(res)
+	}
+	fmt.Printf("Deduplicated %v bytes\n", res.Deduped)
+	return 0, nil
+}
+
+// init appends the "dedup" command, with its --json and --hash-method flags, to commands.
+func init() {
+	commands = append(commands, command{
+		names:   []string{"dedup"},
+		usage:   "Dedup all images",
+		minArgs: 0,
+		maxArgs: 0,
+		action:  dedupCmd,
+		addFlags: func(flags *mflag.FlagSet, cmd *command) {
+			flags.BoolVar(&jsonOutput, []string{"-json", "j"}, jsonOutput, "Prefer JSON output")
+			flags.StringVar(&hashMethodArg, []string{"-hash-method"}, hashMethodArg, "Specify the hash function to use to detect identical files")
+		},
+	})
+}
diff --git a/storage/docs/containers-storage-dedup.md b/storage/docs/containers-storage-dedup.md
new file mode 100644
index 0000000000..eb58ccab44
--- /dev/null
+++ b/storage/docs/containers-storage-dedup.md
@@ -0,0 +1,18 @@
+## containers-storage-dedup 1 "November 2024"
+
+## NAME
+containers-storage dedup - Deduplicate similar files in the images
+
+## 
SYNOPSIS +**containers-storage** **dedup** + +## DESCRIPTION +Find similar files in the images and deduplicate them. It requires reflink support from the file system. + +## OPTIONS +**--hash-method** *method* + +Specify the function to use to calculate the hash for a file. It can be one of: *size*, *crc*, *sha256*. The default is *crc*. + +## EXAMPLE +**containers-storage dedup** diff --git a/storage/tests/dedup.bats b/storage/tests/dedup.bats new file mode 100644 index 0000000000..2eeed91861 --- /dev/null +++ b/storage/tests/dedup.bats @@ -0,0 +1,78 @@ +#!/usr/bin/env bats + +load helpers + +@test "dedup" { + case "$STORAGE_DRIVER" in + overlay*|vfs) + ;; + *) + skip "driver $STORAGE_DRIVER does not support dedup" + ;; + esac + + if test -z "$(which tar 2> /dev/null)" ; then + skip "need tar" + fi + + if test -z "$(which jq 2> /dev/null)" ; then + skip "need jq" + fi + + echo some content > $TESTDIR/from + # Skip the test if the underlying file system does not support reflinks. + if ! cp --reflink=always $TESTDIR/from $TESTDIR/to; then + skip "need reflink support" + fi + + populate + + storage diff -u -f $TESTDIR/lower.tar $lowerlayer + storage diff -c -f $TESTDIR/middle.tar $midlayer + storage diff -u -f $TESTDIR/upper.tar $upperlayer + + # Delete the layers. + storage delete-layer $upperlayer + storage delete-layer $midlayer + storage delete-layer $lowerlayer + + # Create new layers and populate them using the layer diffs. 
+	run storage --debug=false create-layer
+	[ "$status" -eq 0 ]
+	[ "$output" != "" ]
+	lowerlayer="$output"
+	storage applydiff -f $TESTDIR/lower.tar "$lowerlayer"
+	# Middle layer: created with $lowerlayer as its argument (presumably the parent layer).
+	run storage --debug=false create-layer "$lowerlayer"
+	[ "$status" -eq 0 ]
+	[ "$output" != "" ]
+	midlayer="$output"
+	storage applydiff -f $TESTDIR/middle.tar "$midlayer"
+	# Upper layer: receives lower.tar as well as upper.tar, so the lower diff is applied to two layers.
+	run storage --debug=false create-layer "$midlayer"
+	[ "$status" -eq 0 ]
+	[ "$output" != "" ]
+	upperlayer="$output"
+	storage applydiff -f $TESTDIR/lower.tar "$upperlayer"
+	storage applydiff -f $TESTDIR/upper.tar "$upperlayer"
+	# Create one image per layer.
+	for layer in "$lowerlayer" "$midlayer" "$upperlayer"; do
+		run storage --debug=false create-image "$layer"
+		[ "$status" -eq 0 ]
+	done
+	# The first run, with the default hash method, must report a non-zero Deduped value.
+	run storage --debug=false dedup -j
+	[ "$status" -eq 0 ]
+	deduped=$(jq -r .Deduped <<< "$output")
+	[[ "$deduped" -gt 0 ]]
+
+	for METHOD in size crc sha256; do
+		# Test that it always returns the same value with any hash-method.
+		for i in $(seq 10); do
+			run storage --debug=false dedup -j --hash-method="$METHOD"
+			[ "$status" -eq 0 ]
+			actual=$(jq -r .Deduped <<< "$output")
+			[[ "$deduped" = "$actual" ]] # quoted so the right-hand side is compared literally, not as a pattern
+		done
+	done
+}