containers-storage: new command 'dedup'
introduce a new `dedup` command to the `containers-storage` tool to deduplicate similar files in image layers. Reflinks support from the underlying file system is needed. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
parent
f99d1f662b
commit
179437f65c
|
@ -0,0 +1,56 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/containers/storage"
|
||||||
|
"github.com/containers/storage/pkg/mflag"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hashMethodArg is the backing variable for the --hash-method flag of the
// dedup command. It defaults to "crc" and is translated into a
// storage.DedupHashMethod by dedupCmd.
var hashMethodArg = "crc"
|
||||||
|
|
||||||
|
func dedupCmd(flags *mflag.FlagSet, action string, m storage.Store, args []string) (int, error) {
|
||||||
|
var hashMethod storage.DedupHashMethod
|
||||||
|
switch hashMethodArg {
|
||||||
|
case "crc":
|
||||||
|
hashMethod = storage.DedupHashCRC
|
||||||
|
case "size":
|
||||||
|
hashMethod = storage.DedupHashFileSize
|
||||||
|
case "sha256":
|
||||||
|
hashMethod = storage.DedupHashSHA256
|
||||||
|
default:
|
||||||
|
return 1, fmt.Errorf("invalid hash method: %s", hashMethodArg)
|
||||||
|
}
|
||||||
|
res, err := m.Dedup(storage.DedupArgs{
|
||||||
|
Options: storage.DedupOptions{
|
||||||
|
HashMethod: hashMethod,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
if jsonOutput {
|
||||||
|
_, err2 := outputJSON(err)
|
||||||
|
return 1, err2 // Note that err2 is usually nil
|
||||||
|
}
|
||||||
|
return 1, fmt.Errorf("%s: %+v", action, err)
|
||||||
|
}
|
||||||
|
if jsonOutput {
|
||||||
|
return outputJSON(res)
|
||||||
|
} else {
|
||||||
|
fmt.Printf("Deduplicated %v bytes\n", res.Deduped)
|
||||||
|
}
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
commands = append(commands, command{
|
||||||
|
names: []string{"dedup"},
|
||||||
|
usage: "Dedup all images",
|
||||||
|
minArgs: 0,
|
||||||
|
maxArgs: 0,
|
||||||
|
action: dedupCmd,
|
||||||
|
addFlags: func(flags *mflag.FlagSet, cmd *command) {
|
||||||
|
flags.BoolVar(&jsonOutput, []string{"-json", "j"}, jsonOutput, "Prefer JSON output")
|
||||||
|
flags.StringVar(&hashMethodArg, []string{"-hash-method"}, hashMethodArg, "Specify the hash function to use to detect identical files")
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
|
||||||
|
## containers-storage-dedup 1 "November 2024"
|
||||||
|
|
||||||
|
## NAME
|
||||||
|
containers-storage dedup - Deduplicate similar files in the images
|
||||||
|
|
||||||
|
## SYNOPSIS
|
||||||
|
**containers-storage** **dedup** [*options* [...]]
|
||||||
|
|
||||||
|
## DESCRIPTION
|
||||||
|
Find similar files in the images and deduplicate them. It requires reflink support from the file system.
|
||||||
|
|
||||||
|
## OPTIONS
|
||||||
|
**--hash-method** *method*
|
||||||
|
|
||||||
|
Specify the function to use to calculate the hash for a file. It can be one of: *size*, *crc*, *sha256*. The default is *crc*.
|
||||||
|
|
||||||
|
## EXAMPLE
|
||||||
|
**containers-storage dedup**
|
|
@ -0,0 +1,78 @@
|
||||||
|
#!/usr/bin/env bats
|
||||||
|
|
||||||
|
load helpers
|
||||||
|
|
||||||
|
@test "dedup" {
|
||||||
|
case "$STORAGE_DRIVER" in
|
||||||
|
overlay*|vfs)
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
skip "driver $STORAGE_DRIVER does not support dedup"
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
if test -z "$(which tar 2> /dev/null)" ; then
|
||||||
|
skip "need tar"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if test -z "$(which jq 2> /dev/null)" ; then
|
||||||
|
skip "need jq"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo some content > $TESTDIR/from
|
||||||
|
# Skip the test if the underlying file system does not support reflinks.
|
||||||
|
if ! cp --reflink=always $TESTDIR/from $TESTDIR/to; then
|
||||||
|
skip "need reflink support"
|
||||||
|
fi
|
||||||
|
|
||||||
|
populate
|
||||||
|
|
||||||
|
storage diff -u -f $TESTDIR/lower.tar $lowerlayer
|
||||||
|
storage diff -c -f $TESTDIR/middle.tar $midlayer
|
||||||
|
storage diff -u -f $TESTDIR/upper.tar $upperlayer
|
||||||
|
|
||||||
|
# Delete the layers.
|
||||||
|
storage delete-layer $upperlayer
|
||||||
|
storage delete-layer $midlayer
|
||||||
|
storage delete-layer $lowerlayer
|
||||||
|
|
||||||
|
# Create new layers and populate them using the layer diffs.
|
||||||
|
run storage --debug=false create-layer
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
[ "$output" != "" ]
|
||||||
|
lowerlayer="$output"
|
||||||
|
storage applydiff -f $TESTDIR/lower.tar "$lowerlayer"
|
||||||
|
|
||||||
|
run storage --debug=false create-layer "$lowerlayer"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
[ "$output" != "" ]
|
||||||
|
midlayer="$output"
|
||||||
|
storage applydiff -f $TESTDIR/middle.tar "$midlayer"
|
||||||
|
|
||||||
|
run storage --debug=false create-layer "$midlayer"
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
[ "$output" != "" ]
|
||||||
|
upperlayer="$output"
|
||||||
|
storage applydiff -f $TESTDIR/lower.tar "$upperlayer"
|
||||||
|
storage applydiff -f $TESTDIR/upper.tar "$upperlayer"
|
||||||
|
|
||||||
|
for layer in $lowerlayer $midlayer $upperlayer; do
|
||||||
|
run storage --debug=false create-image $layer
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
done
|
||||||
|
|
||||||
|
run storage --debug=false dedup -j
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
deduped=$(jq -r .Deduped <<< $output)
|
||||||
|
[[ $deduped -gt 0 ]]
|
||||||
|
|
||||||
|
for METHOD in size crc sha256; do
|
||||||
|
# Test that it always returns the same value with any hash-method.
|
||||||
|
for i in $(seq 10); do
|
||||||
|
run storage --debug=false dedup -j --hash-method=$METHOD
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
actual=$(jq -r .Deduped <<< $output)
|
||||||
|
[[ $deduped = $actual ]]
|
||||||
|
done
|
||||||
|
done
|
||||||
|
}
|
Loading…
Reference in New Issue