containers-storage: new command 'dedup'
Introduce a new `dedup` command in the `containers-storage` tool to deduplicate similar files in image layers. Reflink support from the underlying file system is required. Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
This commit is contained in:
parent
f99d1f662b
commit
179437f65c
|
@ -0,0 +1,56 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/containers/storage"
|
||||
"github.com/containers/storage/pkg/mflag"
|
||||
)
|
||||
|
||||
// hashMethodArg holds the value of the --hash-method flag of the dedup
// command; its initial value "crc" is also the flag's default.
var hashMethodArg = "crc"
|
||||
|
||||
func dedupCmd(flags *mflag.FlagSet, action string, m storage.Store, args []string) (int, error) {
|
||||
var hashMethod storage.DedupHashMethod
|
||||
switch hashMethodArg {
|
||||
case "crc":
|
||||
hashMethod = storage.DedupHashCRC
|
||||
case "size":
|
||||
hashMethod = storage.DedupHashFileSize
|
||||
case "sha256":
|
||||
hashMethod = storage.DedupHashSHA256
|
||||
default:
|
||||
return 1, fmt.Errorf("invalid hash method: %s", hashMethodArg)
|
||||
}
|
||||
res, err := m.Dedup(storage.DedupArgs{
|
||||
Options: storage.DedupOptions{
|
||||
HashMethod: hashMethod,
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
if jsonOutput {
|
||||
_, err2 := outputJSON(err)
|
||||
return 1, err2 // Note that err2 is usually nil
|
||||
}
|
||||
return 1, fmt.Errorf("%s: %+v", action, err)
|
||||
}
|
||||
if jsonOutput {
|
||||
return outputJSON(res)
|
||||
} else {
|
||||
fmt.Printf("Deduplicated %v bytes\n", res.Deduped)
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
commands = append(commands, command{
|
||||
names: []string{"dedup"},
|
||||
usage: "Dedup all images",
|
||||
minArgs: 0,
|
||||
maxArgs: 0,
|
||||
action: dedupCmd,
|
||||
addFlags: func(flags *mflag.FlagSet, cmd *command) {
|
||||
flags.BoolVar(&jsonOutput, []string{"-json", "j"}, jsonOutput, "Prefer JSON output")
|
||||
flags.StringVar(&hashMethodArg, []string{"-hash-method"}, hashMethodArg, "Specify the hash function to use to detect identical files")
|
||||
},
|
||||
})
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
## containers-storage-dedup 1 "November 2024"
|
||||
|
||||
## NAME
|
||||
containers-storage dedup - Deduplicate similar files in the images
|
||||
|
||||
## SYNOPSIS
|
||||
**containers-storage** **dedup**
|
||||
|
||||
## DESCRIPTION
|
||||
Find similar files in the images and deduplicate them. It requires reflink support from the file system.
|
||||
|
||||
## OPTIONS
|
||||
**--hash-method** *method*
|
||||
|
||||
Specify the function to use to calculate the hash for a file. It can be one of: *size*, *crc*, *sha256*. The default is *crc*.
|
||||
|
||||
## EXAMPLE
|
||||
**containers-storage dedup**
|
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env bats
|
||||
|
||||
load helpers
|
||||
|
||||
@test "dedup" {
|
||||
case "$STORAGE_DRIVER" in
|
||||
overlay*|vfs)
|
||||
;;
|
||||
*)
|
||||
skip "driver $STORAGE_DRIVER does not support dedup"
|
||||
;;
|
||||
esac
|
||||
|
||||
if test -z "$(which tar 2> /dev/null)" ; then
|
||||
skip "need tar"
|
||||
fi
|
||||
|
||||
if test -z "$(which jq 2> /dev/null)" ; then
|
||||
skip "need jq"
|
||||
fi
|
||||
|
||||
echo some content > $TESTDIR/from
|
||||
# Skip the test if the underlying file system does not support reflinks.
|
||||
if ! cp --reflink=always $TESTDIR/from $TESTDIR/to; then
|
||||
skip "need reflink support"
|
||||
fi
|
||||
|
||||
populate
|
||||
|
||||
storage diff -u -f $TESTDIR/lower.tar $lowerlayer
|
||||
storage diff -c -f $TESTDIR/middle.tar $midlayer
|
||||
storage diff -u -f $TESTDIR/upper.tar $upperlayer
|
||||
|
||||
# Delete the layers.
|
||||
storage delete-layer $upperlayer
|
||||
storage delete-layer $midlayer
|
||||
storage delete-layer $lowerlayer
|
||||
|
||||
# Create new layers and populate them using the layer diffs.
|
||||
run storage --debug=false create-layer
|
||||
[ "$status" -eq 0 ]
|
||||
[ "$output" != "" ]
|
||||
lowerlayer="$output"
|
||||
storage applydiff -f $TESTDIR/lower.tar "$lowerlayer"
|
||||
|
||||
run storage --debug=false create-layer "$lowerlayer"
|
||||
[ "$status" -eq 0 ]
|
||||
[ "$output" != "" ]
|
||||
midlayer="$output"
|
||||
storage applydiff -f $TESTDIR/middle.tar "$midlayer"
|
||||
|
||||
run storage --debug=false create-layer "$midlayer"
|
||||
[ "$status" -eq 0 ]
|
||||
[ "$output" != "" ]
|
||||
upperlayer="$output"
|
||||
storage applydiff -f $TESTDIR/lower.tar "$upperlayer"
|
||||
storage applydiff -f $TESTDIR/upper.tar "$upperlayer"
|
||||
|
||||
for layer in $lowerlayer $midlayer $upperlayer; do
|
||||
run storage --debug=false create-image $layer
|
||||
[ "$status" -eq 0 ]
|
||||
done
|
||||
|
||||
run storage --debug=false dedup -j
|
||||
[ "$status" -eq 0 ]
|
||||
deduped=$(jq -r .Deduped <<< $output)
|
||||
[[ $deduped -gt 0 ]]
|
||||
|
||||
for METHOD in size crc sha256; do
|
||||
# Test that it always returns the same value with any hash-method.
|
||||
for i in $(seq 10); do
|
||||
run storage --debug=false dedup -j --hash-method=$METHOD
|
||||
[ "$status" -eq 0 ]
|
||||
actual=$(jq -r .Deduped <<< $output)
|
||||
[[ $deduped = $actual ]]
|
||||
done
|
||||
done
|
||||
}
|
Loading…
Reference in New Issue