// compressible_test.go

package compress

import (
	"crypto/rand"
	"encoding/base32"
	"testing"
)

// BenchmarkEstimate measures Estimate over inputs that differ in how
// predictable their byte sequence is and how evenly their byte values are
// distributed. Each input runs as its own sub-benchmark, reports throughput
// through SetBytes, and logs one Estimate result for that input.
func BenchmarkEstimate(b *testing.B) {
	// random returns n bytes read from crypto/rand. A failed read aborts the
	// sub-benchmark instead of silently measuring a zero-filled buffer.
	random := func(b *testing.B, n int) []byte {
		buf := make([]byte, n)
		if _, err := rand.Read(buf); err != nil {
			b.Fatal(err)
		}
		return buf
	}

	// Inputs are built lazily inside the sub-benchmark so that cases filtered
	// out by -bench do not pay for data generation.
	cases := []struct {
		name string
		data func(b *testing.B) []byte
	}{
		{
			// (predictable, low entropy distribution)
			name: "zeroes-5k",
			data: func(*testing.B) []byte { return make([]byte, 5000) },
		},
		{
			// (predictable, high entropy distribution)
			name: "predictable-5k",
			data: func(*testing.B) []byte {
				ramp := make([]byte, 5000)
				for i := range ramp {
					ramp[i] = byte(float64(i) / float64(len(ramp)) * 256)
				}
				return ramp
			},
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-500b",
			data: func(b *testing.B) []byte { return random(b, 500) },
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-5k",
			data: func(b *testing.B) []byte { return random(b, 5000) },
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-50k",
			data: func(b *testing.B) []byte { return random(b, 50000) },
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-500k",
			data: func(b *testing.B) []byte { return random(b, 500000) },
		},
		{
			// (not predictable, medium entropy distribution)
			name: "base-32-5k",
			data: func(b *testing.B) []byte {
				// Encode random bytes as base32 text and keep the first
				// 5000 characters of the encoding.
				encoded := base32.StdEncoding.EncodeToString(random(b, 5000))
				return []byte(encoded)[:5000]
			},
		},
		{
			// (medium predictable, medium entropy distribution)
			name: "text",
			data: func(*testing.B) []byte {
				sample := []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
				// Doubling twice yields four copies of the text.
				sample = append(sample, sample...)
				sample = append(sample, sample...)
				return sample
			},
		},
	}

	for _, tc := range cases {
		b.Run(tc.name, func(b *testing.B) {
			// ReportAllocs only affects the benchmark that calls it, so it
			// has to be enabled here rather than on the parent.
			b.ReportAllocs()
			input := tc.data(b)
			b.SetBytes(int64(len(input)))
			// Exclude input construction from the measurement.
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				Estimate(input)
			}
			b.Log(Estimate(input))
		})
	}
}

// BenchmarkSnannonEntropyBits measures ShannonEntropyBits over inputs that
// differ in how predictable their byte sequence is and how evenly their byte
// values are distributed. Each input runs as its own sub-benchmark, reports
// throughput through SetBytes, and logs one ShannonEntropyBits result for
// that input.
//
// NOTE(review): the function name is misspelled ("Snannon" for "Shannon"). It
// is left as is because renaming would change the benchmark name that -bench
// filters match.
func BenchmarkSnannonEntropyBits(b *testing.B) {
	// random returns n bytes read from crypto/rand. A failed read aborts the
	// sub-benchmark instead of silently measuring a zero-filled buffer.
	random := func(b *testing.B, n int) []byte {
		buf := make([]byte, n)
		if _, err := rand.Read(buf); err != nil {
			b.Fatal(err)
		}
		return buf
	}

	// Inputs are built lazily inside the sub-benchmark so that cases filtered
	// out by -bench do not pay for data generation.
	cases := []struct {
		name string
		data func(b *testing.B) []byte
	}{
		{
			// (predictable, low entropy distribution)
			name: "zeroes-5k",
			data: func(*testing.B) []byte { return make([]byte, 5000) },
		},
		{
			// (predictable, high entropy distribution)
			name: "predictable-5k",
			data: func(*testing.B) []byte {
				ramp := make([]byte, 5000)
				for i := range ramp {
					ramp[i] = byte(float64(i) / float64(len(ramp)) * 256)
				}
				return ramp
			},
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-500b",
			data: func(b *testing.B) []byte { return random(b, 500) },
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-5k",
			data: func(b *testing.B) []byte { return random(b, 5000) },
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-50k",
			data: func(b *testing.B) []byte { return random(b, 50000) },
		},
		{
			// (not predictable, high entropy distribution)
			name: "random-500k",
			data: func(b *testing.B) []byte { return random(b, 500000) },
		},
		{
			// (not predictable, medium entropy distribution)
			name: "base-32-5k",
			data: func(b *testing.B) []byte {
				// Encode random bytes as base32 text and keep the first
				// 5000 characters of the encoding.
				encoded := base32.StdEncoding.EncodeToString(random(b, 5000))
				return []byte(encoded)[:5000]
			},
		},
		{
			// (medium predictable, medium entropy distribution)
			name: "text",
			data: func(*testing.B) []byte {
				sample := []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
				// Doubling twice yields four copies of the text.
				sample = append(sample, sample...)
				sample = append(sample, sample...)
				return sample
			},
		},
	}

	for _, tc := range cases {
		b.Run(tc.name, func(b *testing.B) {
			// ReportAllocs only affects the benchmark that calls it, so it
			// has to be enabled here rather than on the parent.
			b.ReportAllocs()
			input := tc.data(b)
			b.SetBytes(int64(len(input)))
			// Exclude input construction from the measurement.
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				ShannonEntropyBits(input)
			}
			b.Log(ShannonEntropyBits(input))
		})
	}
}