diff --git a/go.mod b/go.mod
index 3fc5f5e..e647b3d 100644
--- a/go.mod
+++ b/go.mod
@@ -3,25 +3,25 @@ module github.com/DroidsOnRoids/bitrise-step-flutter
 go 1.17
 
 require (
-	github.com/bitrise-io/go-utils v0.0.0-20200224122728-e212188d99b4
+	github.com/bitrise-io/go-utils v1.0.2
 	github.com/bitrise-tools/go-steputils v0.0.0-20200227150459-94490ca44ddb
 	github.com/blang/semver v3.5.1+incompatible
-	github.com/magiconair/properties v1.8.2-0.20200610093828-cfba716d4c41
+	github.com/magiconair/properties v1.8.6
 	github.com/mholt/archiver v1.1.3-0.20190917152942-233fc16b9615
-	github.com/stretchr/testify v1.6.1
+	github.com/stretchr/testify v1.7.0
 )
 
 require (
-	github.com/andybalholm/brotli v1.0.0 // indirect
+	github.com/andybalholm/brotli v1.0.4 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/dsnet/compress v0.0.1 // indirect
-	github.com/golang/snappy v0.0.1 // indirect
-	github.com/klauspost/compress v1.10.10 // indirect
-	github.com/klauspost/pgzip v1.2.4 // indirect
-	github.com/nwaples/rardecode v1.1.0 // indirect
-	github.com/pierrec/lz4 v2.5.2+incompatible // indirect
+	github.com/golang/snappy v0.0.4 // indirect
+	github.com/klauspost/compress v1.15.4 // indirect
+	github.com/klauspost/pgzip v1.2.5 // indirect
+	github.com/nwaples/rardecode v1.1.3 // indirect
+	github.com/pierrec/lz4 v2.6.1+incompatible // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
-	github.com/ulikunitz/xz v0.5.7 // indirect
+	github.com/ulikunitz/xz v0.5.10 // indirect
 	github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 // indirect
-	gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776 // indirect
+	gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
 )
diff --git a/go.sum b/go.sum
index e200a9e..c987c40 100644
--- a/go.sum
+++ b/go.sum
@@ -1,8 +1,12 @@
 github.com/andybalholm/brotli v0.0.0-20190621154722-5f990b63d2d6/go.mod h1:+lx6/Aqd1kLJ1GQfkvOnaZ1WGmLpMpbprPuIOOZX30U=
 github.com/andybalholm/brotli v1.0.0 h1:7UCwP93aiSfvWpapti8g88vVVGp2qqtGyePsSuDafo4=
 github.com/andybalholm/brotli v1.0.0/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
+github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY=
+github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
 github.com/bitrise-io/go-utils v0.0.0-20200224122728-e212188d99b4 h1:35ImX3SrDgRYVDPue7NhybnQ3quuVrTQzjjul+R3aUI=
 github.com/bitrise-io/go-utils v0.0.0-20200224122728-e212188d99b4/go.mod h1:tTEsKvbz1LbzuN/KpVFHXnLtcAPdEgIdM41s0lL407s=
+github.com/bitrise-io/go-utils v1.0.2 h1:w4Mz2IvrgDzrFJECuHdvsK1LHO30cdtuy9bBa7Lw2c0=
+github.com/bitrise-io/go-utils v1.0.2/go.mod h1:ZY1DI+fEpZuFpO9szgDeICM4QbqoWVt0RSY3tRI1heY=
 github.com/bitrise-tools/go-steputils v0.0.0-20200227150459-94490ca44ddb h1:5zeXRDfZntzK+A2MNdNoU1uuueOukoC/Lh3N3dZICSI=
 github.com/bitrise-tools/go-steputils v0.0.0-20200227150459-94490ca44ddb/go.mod h1:4eEx1Bd2AAVAN/YGK4V066jO7ekrsLymkyQZzx2UEwQ=
 github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ=
@@ -16,49 +20,82 @@ github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdf
 github.com/golang/gddo v0.0.0-20190419222130-af0f2af80721/go.mod h1:xEhNfoBDX1hzLm2Nf80qUvZ2sVwoMZ8d6IE2SrsQfh4=
 github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
 github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
+github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
 github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
+github.com/hashicorp/go-cleanhttp v0.5.1/go.mod h1:JpRdi6/HCYpAwUzNwuwqhbovhLtngrth3wmdIIUrZ80=
+github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48=
+github.com/hashicorp/go-hclog v0.9.2/go.mod h1:5CU+agLiy3J7N7QjHK5d05KxGsuXiQLrjA0H7acj2lQ=
+github.com/hashicorp/go-retryablehttp v0.7.0/go.mod h1:vAew36LZh98gCBJNLH42IQ1ER/9wtLZZ8meHqQvEYWY=
 github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00=
 github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.7.4/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.10.10 h1:a/y8CglcM7gLGYmlbP/stPE5sR3hbhFRUjCBfd/0B3I=
 github.com/klauspost/compress v1.10.10/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.15.4 h1:1kn4/7MepF/CHmYub99/nNX8az0IJjfSOU/jbnTVfqQ=
+github.com/klauspost/compress v1.15.4/go.mod h1:PhcZ0MbTNciWF3rruxRgKxI5NkcHHrHUDtV4Yw2GlzU=
 github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/pgzip v1.2.1/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
 github.com/klauspost/pgzip v1.2.4 h1:TQ7CNpYKovDOmqzRHKxJh0BeaBI7UdQZYc6p7pMQh1A=
 github.com/klauspost/pgzip v1.2.4/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
+github.com/klauspost/pgzip v1.2.5 h1:qnWYvvKqedOF2ulHpMG72XQol4ILEJ8k2wwRl/Km8oE=
+github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
 github.com/magiconair/properties v1.8.2-0.20200610093828-cfba716d4c41 h1:FZWX5EyBobAddDAP0XahKO1LrWKGB5AaNpVO4kdyXdA=
 github.com/magiconair/properties v1.8.2-0.20200610093828-cfba716d4c41/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60=
+github.com/magiconair/properties v1.8.6 h1:5ibWZ6iY0NctNGWo87LalDlEZ6R41TqbbDamhfG/Qzo=
+github.com/magiconair/properties v1.8.6/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60=
 github.com/mholt/archiver v1.1.3-0.20190917152942-233fc16b9615 h1:2C0a9oPTqjpvyg9UCGP6Xan2fR4RUb/LfrNaLAcKT50=
 github.com/mholt/archiver v1.1.3-0.20190917152942-233fc16b9615/go.mod h1:15A2vUgLkyQMX8s3CbMj8aYNqkegWVTi4xXE46UGHNY=
 github.com/nwaples/rardecode v1.0.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
 github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
 github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
+github.com/nwaples/rardecode v1.1.3 h1:cWCaZwfM5H7nAD6PyEdcVnczzV8i/JtotnyW/dD9lEc=
+github.com/nwaples/rardecode v1.1.3/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
 github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
 github.com/pierrec/lz4 v2.5.2+incompatible h1:WCjObylUIOlKy/+7Abdn34TLIkXiA4UWUMhxq9m9ZXI=
 github.com/pierrec/lz4 v2.5.2+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
+github.com/pierrec/lz4 v2.6.1+incompatible h1:9UY3+iC23yxF0UfGaYrGplQ+79Rg+h/q9FV9ix19jjM=
+github.com/pierrec/lz4 v2.6.1+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.3.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
+github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
 github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA=
 github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
 github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
 github.com/ulikunitz/xz v0.5.7 h1:YvTNdFzX6+W5m9msiYg/zpkSURPPtOlzbqYjrFn7Yt4=
 github.com/ulikunitz/xz v0.5.7/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
+github.com/ulikunitz/xz v0.5.10 h1:t92gobL9l3HE202wg3rlk19F6X+JOxl9BBrCCMYEYd8=
+github.com/ulikunitz/xz v0.5.10/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
 github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8 h1:nIPpBwaJSVYIxUFsDv3M8ofmx9yWTog9BfvIu0q41lo=
 github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8/go.mod h1:HUYIGzjTL3rfEspMxjDjgmT5uz5wzYJKVo23qUhYTos=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20211202192323-5770296d904e/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
 golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
 golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200219091948-cb0a6d8edb6c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211205182925-97ca703d548d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
 golang.org/x/tools v0.0.0-20200220224806-8a925fa4c0df/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
@@ -66,6 +103,9 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776 h1:tQIYjPdBoyREyB9XMu+nnTclpTYkz2zFM+lzLJFO4gQ=
 gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
+gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/main.go b/main.go
index 93537dd..48843a5 100644
--- a/main.go
+++ b/main.go
@@ -84,7 +84,7 @@ func downloadAndExtractReleaseSdk(flutterVersion, flutterSdkDestinationDir strin
 	channel := versionComponents[len(versionComponents)-1]
 
 	flutterSdkSourceURL := fmt.Sprintf(
-		"https://storage.googleapis.com/flutter_infra/releases/%s/%s/flutter_%s_v%s.%s",
+		"https://storage.googleapis.com/flutter_infra_release/releases/%s/%s/flutter_%s_v%s.%s",
 		channel,
 		getFlutterPlatform(),
 		getFlutterPlatform(),
diff --git a/vendor/github.com/andybalholm/brotli/README.md b/vendor/github.com/andybalholm/brotli/README.md
index 02d115e..1ea7fdb 100644
--- a/vendor/github.com/andybalholm/brotli/README.md
+++ b/vendor/github.com/andybalholm/brotli/README.md
@@ -3,3 +3,5 @@ It was translated from the reference implementation (https://github.com/google/b
 with the `c2go` tool at https://github.com/andybalholm/c2go.
 
 I am using it in production with https://github.com/andybalholm/redwood.
+
+API documentation is found at https://pkg.go.dev/github.com/andybalholm/brotli?tab=doc.
diff --git a/vendor/github.com/andybalholm/brotli/backward_references.go b/vendor/github.com/andybalholm/brotli/backward_references.go
index 1642e47..008c054 100644
--- a/vendor/github.com/andybalholm/brotli/backward_references.go
+++ b/vendor/github.com/andybalholm/brotli/backward_references.go
@@ -1,5 +1,9 @@
 package brotli
 
+import (
+	"sync"
+)
+
 /* Copyright 2013 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -31,13 +35,10 @@ func computeDistanceCode(distance uint, max_distance uint, dist_cache []int) uin
 	return distance + numDistanceShortCodes - 1
 }
 
-/* "commands" points to the next output command to write to, "*num_commands" is
-   initially the total amount of commands output by previous
-   CreateBackwardReferences calls, and must be incremented by the amount written
-   by this call. */
-func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands []command, num_commands *uint, num_literals *uint) {
+var hasherSearchResultPool sync.Pool
+
+func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands *[]command, num_literals *uint) {
 	var max_backward_limit uint = maxBackwardLimit(params.lgwin)
-	var orig_commands []command = commands
 	var insert_length uint = *last_insert_len
 	var pos_end uint = position + num_bytes
 	var store_end uint
@@ -57,8 +58,14 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
 
 	/* Minimum score to accept a backward reference. */
 	hasher.PrepareDistanceCache(dist_cache)
-	var sr2 hasherSearchResult
-	var sr hasherSearchResult
+	sr2, _ := hasherSearchResultPool.Get().(*hasherSearchResult)
+	if sr2 == nil {
+		sr2 = &hasherSearchResult{}
+	}
+	sr, _ := hasherSearchResultPool.Get().(*hasherSearchResult)
+	if sr == nil {
+		sr = &hasherSearchResult{}
+	}
 
 	for position+hasher.HashTypeLength() < pos_end {
 		var max_length uint = pos_end - position
@@ -67,7 +74,7 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
 		sr.len_code_delta = 0
 		sr.distance = 0
 		sr.score = kMinScore
-		hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position, max_length, max_distance, gap, params.dist.max_distance, &sr)
+		hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position, max_length, max_distance, gap, params.dist.max_distance, sr)
 		if sr.score > kMinScore {
 			/* Found a match. Let's look for something even better ahead. */
 			var delayed_backward_references_in_row int = 0
@@ -83,14 +90,14 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
 				sr2.distance = 0
 				sr2.score = kMinScore
 				max_distance = brotli_min_size_t(position+1, max_backward_limit)
-				hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position+1, max_length, max_distance, gap, params.dist.max_distance, &sr2)
+				hasher.FindLongestMatch(&params.dictionary, ringbuffer, ringbuffer_mask, dist_cache, position+1, max_length, max_distance, gap, params.dist.max_distance, sr2)
 				if sr2.score >= sr.score+cost_diff_lazy {
 					/* Ok, let's just write one byte for now and start a match from the
 					   next byte. */
 					position++
 
 					insert_length++
-					sr = sr2
+					*sr = *sr2
 					delayed_backward_references_in_row++
 					if delayed_backward_references_in_row < 4 && position+hasher.HashTypeLength() < pos_end {
 						continue
@@ -114,8 +121,7 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
 					hasher.PrepareDistanceCache(dist_cache)
 				}
 
-				initCommand(&commands[0], &params.dist, insert_length, sr.len, sr.len_code_delta, distance_code)
-				commands = commands[1:]
+				*commands = append(*commands, makeCommand(&params.dist, insert_length, sr.len, sr.len_code_delta, distance_code))
 			}
 
 			*num_literals += insert_length
@@ -173,5 +179,7 @@ func createBackwardReferences(num_bytes uint, position uint, ringbuffer []byte,
 
 	insert_length += pos_end - position
 	*last_insert_len = insert_length
-	*num_commands += uint(-cap(commands) + cap(orig_commands))
+
+	hasherSearchResultPool.Put(sr)
+	hasherSearchResultPool.Put(sr2)
 }
diff --git a/vendor/github.com/andybalholm/brotli/backward_references_hq.go b/vendor/github.com/andybalholm/brotli/backward_references_hq.go
index 5eac736..21629c1 100644
--- a/vendor/github.com/andybalholm/brotli/backward_references_hq.go
+++ b/vendor/github.com/andybalholm/brotli/backward_references_hq.go
@@ -123,14 +123,13 @@ func setCost(histogram []uint32, histogram_size uint, literal_histogram bool, co
 	}
 }
 
-func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbuffer []byte, ringbuffer_mask uint, commands []command, num_commands uint, last_insert_len uint) {
+func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbuffer []byte, ringbuffer_mask uint, commands []command, last_insert_len uint) {
 	var histogram_literal [numLiteralSymbols]uint32
 	var histogram_cmd [numCommandSymbols]uint32
 	var histogram_dist [maxEffectiveDistanceAlphabetSize]uint32
 	var cost_literal [numLiteralSymbols]float32
 	var pos uint = position - last_insert_len
 	var min_cost_cmd float32 = kInfinity
-	var i uint
 	var cost_cmd []float32 = self.cost_cmd_[:]
 	var literal_costs []float32
 
@@ -138,7 +137,7 @@ func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbu
 	histogram_cmd = [numCommandSymbols]uint32{}
 	histogram_dist = [maxEffectiveDistanceAlphabetSize]uint32{}
 
-	for i = 0; i < num_commands; i++ {
+	for i := range commands {
 		var inslength uint = uint(commands[i].insert_len_)
 		var copylength uint = uint(commandCopyLen(&commands[i]))
 		var distcode uint = uint(commands[i].dist_prefix_) & 0x3FF
@@ -161,7 +160,7 @@ func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbu
 	setCost(histogram_cmd[:], numCommandSymbols, false, cost_cmd)
 	setCost(histogram_dist[:], uint(self.distance_histogram_size), false, self.cost_dist_)
 
-	for i = 0; i < numCommandSymbols; i++ {
+	for i := 0; i < numCommandSymbols; i++ {
 		min_cost_cmd = brotli_min_float(min_cost_cmd, cost_cmd[i])
 	}
 
@@ -169,10 +168,10 @@ func zopfliCostModelSetFromCommands(self *zopfliCostModel, position uint, ringbu
 	{
 		literal_costs = self.literal_costs_
 		var literal_carry float32 = 0.0
-		var num_bytes uint = self.num_bytes_
+		num_bytes := int(self.num_bytes_)
 		literal_costs[0] = 0.0
-		for i = 0; i < num_bytes; i++ {
-			literal_carry += cost_literal[ringbuffer[(position+i)&ringbuffer_mask]]
+		for i := 0; i < num_bytes; i++ {
+			literal_carry += cost_literal[ringbuffer[(position+uint(i))&ringbuffer_mask]]
 			literal_costs[i+1] = literal_costs[i] + literal_carry
 			literal_carry -= literal_costs[i+1] - literal_costs[i]
 		}
@@ -502,7 +501,9 @@ func updateNodes(num_bytes uint, block_start uint, pos uint, ringbuffer []byte,
 					var cost float32 = dist_cost + float32(getCopyExtra(copycode)) + zopfliCostModelGetCommandCost(model, cmdcode)
 					if cost < nodes[pos+len].u.cost {
 						updateZopfliNode(nodes, pos, start, uint(len), len_code, dist, 0, cost)
-						result = brotli_max_size_t(result, uint(len))
+						if len > result {
+							result = len
+						}
 					}
 				}
 			}
@@ -530,7 +531,7 @@ func computeShortestPathFromNodes(num_bytes uint, nodes []zopfliNode) uint {
 }
 
 /* REQUIRES: nodes != NULL and len(nodes) >= num_bytes + 1 */
-func zopfliCreateCommands(num_bytes uint, block_start uint, nodes []zopfliNode, dist_cache []int, last_insert_len *uint, params *encoderParams, commands []command, num_literals *uint) {
+func zopfliCreateCommands(num_bytes uint, block_start uint, nodes []zopfliNode, dist_cache []int, last_insert_len *uint, params *encoderParams, commands *[]command, num_literals *uint) {
 	var max_backward_limit uint = maxBackwardLimit(params.lgwin)
 	var pos uint = 0
 	var offset uint32 = nodes[0].u.next
@@ -552,7 +553,7 @@ func zopfliCreateCommands(num_bytes uint, block_start uint, nodes []zopfliNode,
 			var max_distance uint = brotli_min_size_t(block_start+pos, max_backward_limit)
 			var is_dictionary bool = (distance > max_distance+gap)
 			var dist_code uint = uint(zopfliNodeDistanceCode(next))
-			initCommand(&commands[i], &params.dist, insert_length, copy_length, int(len_code)-int(copy_length), dist_code)
+			*commands = append(*commands, makeCommand(&params.dist, insert_length, copy_length, int(len_code)-int(copy_length), dist_code))
 
 			if !is_dictionary && dist_code > 0 {
 				dist_cache[3] = dist_cache[2]
@@ -679,16 +680,16 @@ func zopfliComputeShortestPath(num_bytes uint, position uint, ringbuffer []byte,
 	return computeShortestPathFromNodes(num_bytes, nodes)
 }
 
-func createZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher *h10, dist_cache []int, last_insert_len *uint, commands []command, num_commands *uint, num_literals *uint) {
+func createZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher *h10, dist_cache []int, last_insert_len *uint, commands *[]command, num_literals *uint) {
 	var nodes []zopfliNode
 	nodes = make([]zopfliNode, (num_bytes + 1))
 	initZopfliNodes(nodes, num_bytes+1)
-	*num_commands += zopfliComputeShortestPath(num_bytes, position, ringbuffer, ringbuffer_mask, params, dist_cache, hasher, nodes)
+	zopfliComputeShortestPath(num_bytes, position, ringbuffer, ringbuffer_mask, params, dist_cache, hasher, nodes)
 	zopfliCreateCommands(num_bytes, position, nodes, dist_cache, last_insert_len, params, commands, num_literals)
 	nodes = nil
 }
 
-func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands []command, num_commands *uint, num_literals *uint) {
+func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer []byte, ringbuffer_mask uint, params *encoderParams, hasher hasherHandle, dist_cache []int, last_insert_len *uint, commands *[]command, num_literals *uint) {
 	var max_backward_limit uint = maxBackwardLimit(params.lgwin)
 	var num_matches []uint32 = make([]uint32, num_bytes)
 	var matches_size uint = 4 * num_bytes
@@ -703,7 +704,7 @@ func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer
 	var orig_num_literals uint
 	var orig_last_insert_len uint
 	var orig_dist_cache [4]int
-	var orig_num_commands uint
+	var orig_num_commands int
 	var model zopfliCostModel
 	var nodes []zopfliNode
 	var matches []backwardMatch = make([]backwardMatch, matches_size)
@@ -769,7 +770,7 @@ func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer
 	orig_num_literals = *num_literals
 	orig_last_insert_len = *last_insert_len
 	copy(orig_dist_cache[:], dist_cache[:4])
-	orig_num_commands = *num_commands
+	orig_num_commands = len(*commands)
 	nodes = make([]zopfliNode, (num_bytes + 1))
 	initZopfliCostModel(&model, &params.dist, num_bytes)
 	for i = 0; i < 2; i++ {
@@ -777,14 +778,14 @@ func createHqZopfliBackwardReferences(num_bytes uint, position uint, ringbuffer
 		if i == 0 {
 			zopfliCostModelSetFromLiteralCosts(&model, position, ringbuffer, ringbuffer_mask)
 		} else {
-			zopfliCostModelSetFromCommands(&model, position, ringbuffer, ringbuffer_mask, commands, *num_commands-orig_num_commands, orig_last_insert_len)
+			zopfliCostModelSetFromCommands(&model, position, ringbuffer, ringbuffer_mask, (*commands)[orig_num_commands:], orig_last_insert_len)
 		}
 
-		*num_commands = orig_num_commands
+		*commands = (*commands)[:orig_num_commands]
 		*num_literals = orig_num_literals
 		*last_insert_len = orig_last_insert_len
 		copy(dist_cache, orig_dist_cache[:4])
-		*num_commands += zopfliIterate(num_bytes, position, ringbuffer, ringbuffer_mask, params, gap, dist_cache, &model, num_matches, matches, nodes)
+		zopfliIterate(num_bytes, position, ringbuffer, ringbuffer_mask, params, gap, dist_cache, &model, num_matches, matches, nodes)
 		zopfliCreateCommands(num_bytes, position, nodes, dist_cache, last_insert_len, params, commands, num_literals)
 	}
 
diff --git a/vendor/github.com/andybalholm/brotli/block_splitter.go b/vendor/github.com/andybalholm/brotli/block_splitter.go
index 2ccff45..978a131 100644
--- a/vendor/github.com/andybalholm/brotli/block_splitter.go
+++ b/vendor/github.com/andybalholm/brotli/block_splitter.go
@@ -33,23 +33,21 @@ const (
 	kMinItersForRefining         uint    = 100
 )
 
-func countLiterals(cmds []command, num_commands uint) uint {
+func countLiterals(cmds []command) uint {
 	var total_length uint = 0
 	/* Count how many we have. */
 
-	var i uint
-	for i = 0; i < num_commands; i++ {
+	for i := range cmds {
 		total_length += uint(cmds[i].insert_len_)
 	}
 
 	return total_length
 }
 
-func copyLiteralsToByteArray(cmds []command, num_commands uint, data []byte, offset uint, mask uint, literals []byte) {
+func copyLiteralsToByteArray(cmds []command, data []byte, offset uint, mask uint, literals []byte) {
 	var pos uint = 0
 	var from_pos uint = offset & mask
-	var i uint
-	for i = 0; i < num_commands; i++ {
+	for i := range cmds {
 		var insert_len uint = uint(cmds[i].insert_len_)
 		if from_pos+insert_len > mask {
 			var head_size uint = mask + 1 - from_pos
@@ -90,24 +88,19 @@ const clustersPerBatch = 16
 func initBlockSplit(self *blockSplit) {
 	self.num_types = 0
 	self.num_blocks = 0
-	self.types = nil
-	self.lengths = nil
+	self.types = self.types[:0]
+	self.lengths = self.lengths[:0]
 	self.types_alloc_size = 0
 	self.lengths_alloc_size = 0
 }
 
-func destroyBlockSplit(self *blockSplit) {
-	self.types = nil
-	self.lengths = nil
-}
-
-func splitBlock(cmds []command, num_commands uint, data []byte, pos uint, mask uint, params *encoderParams, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit) {
+func splitBlock(cmds []command, data []byte, pos uint, mask uint, params *encoderParams, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit) {
 	{
-		var literals_count uint = countLiterals(cmds, num_commands)
+		var literals_count uint = countLiterals(cmds)
 		var literals []byte = make([]byte, literals_count)
 
 		/* Create a continuous array of literals. */
-		copyLiteralsToByteArray(cmds, num_commands, data, pos, mask, literals)
+		copyLiteralsToByteArray(cmds, data, pos, mask, literals)
 
 		/* Create the block split on the array of literals.
 		   Literal histograms have alphabet size 256. */
@@ -116,28 +109,26 @@ func splitBlock(cmds []command, num_commands uint, data []byte, pos uint, mask u
 		literals = nil
 	}
 	{
-		var insert_and_copy_codes []uint16 = make([]uint16, num_commands)
+		var insert_and_copy_codes []uint16 = make([]uint16, len(cmds))
 		/* Compute prefix codes for commands. */
 
-		var i uint
-		for i = 0; i < num_commands; i++ {
+		for i := range cmds {
 			insert_and_copy_codes[i] = cmds[i].cmd_prefix_
 		}
 
 		/* Create the block split on the array of command prefixes. */
-		splitByteVectorCommand(insert_and_copy_codes, num_commands, kSymbolsPerCommandHistogram, kMaxCommandHistograms, kCommandStrideLength, kCommandBlockSwitchCost, params, insert_and_copy_split)
+		splitByteVectorCommand(insert_and_copy_codes, kSymbolsPerCommandHistogram, kMaxCommandHistograms, kCommandStrideLength, kCommandBlockSwitchCost, params, insert_and_copy_split)
 
 		/* TODO: reuse for distances? */
 
 		insert_and_copy_codes = nil
 	}
 	{
-		var distance_prefixes []uint16 = make([]uint16, num_commands)
+		var distance_prefixes []uint16 = make([]uint16, len(cmds))
 		var j uint = 0
 		/* Create a continuous array of distance prefixes. */
 
-		var i uint
-		for i = 0; i < num_commands; i++ {
+		for i := range cmds {
 			var cmd *command = &cmds[i]
 			if commandCopyLen(cmd) != 0 && cmd.cmd_prefix_ >= 128 {
 				distance_prefixes[j] = cmd.dist_prefix_ & 0x3FF
diff --git a/vendor/github.com/andybalholm/brotli/block_splitter_command.go b/vendor/github.com/andybalholm/brotli/block_splitter_command.go
index e505fe1..9dec13e 100644
--- a/vendor/github.com/andybalholm/brotli/block_splitter_command.go
+++ b/vendor/github.com/andybalholm/brotli/block_splitter_command.go
@@ -372,7 +372,8 @@ func clusterBlocksCommand(data []uint16, length uint, num_blocks uint, block_ids
 	histogram_symbols = nil
 }
 
-func splitByteVectorCommand(data []uint16, length uint, literals_per_histogram uint, max_histograms uint, sampling_stride_length uint, block_switch_cost float64, params *encoderParams, split *blockSplit) {
+func splitByteVectorCommand(data []uint16, literals_per_histogram uint, max_histograms uint, sampling_stride_length uint, block_switch_cost float64, params *encoderParams, split *blockSplit) {
+	length := uint(len(data))
 	var data_size uint = histogramDataSizeCommand()
 	var num_histograms uint = length/literals_per_histogram + 1
 	var histograms []histogramCommand
diff --git a/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go b/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
index 395f604..7acfb18 100644
--- a/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
+++ b/vendor/github.com/andybalholm/brotli/brotli_bit_stream.go
@@ -1,6 +1,9 @@
 package brotli
 
-import "math"
+import (
+	"math"
+	"sync"
+)
 
 const maxHuffmanTreeSize = (2*numCommandSymbols + 1)
 
@@ -411,10 +414,12 @@ func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabe
 	}
 }
 
-func sortHuffmanTree1(v0 *huffmanTree, v1 *huffmanTree) bool {
+func sortHuffmanTree1(v0 huffmanTree, v1 huffmanTree) bool {
 	return v0.total_count_ < v1.total_count_
 }
 
+var huffmanTreePool sync.Pool
+
 func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var count uint = 0
 	var symbols = [4]uint{0}
@@ -446,7 +451,13 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 	}
 	{
 		var max_tree_size uint = 2*length + 1
-		var tree []huffmanTree = make([]huffmanTree, max_tree_size)
+		tree, _ := huffmanTreePool.Get().(*[]huffmanTree)
+		if tree == nil || cap(*tree) < int(max_tree_size) {
+			tmp := make([]huffmanTree, max_tree_size)
+			tree = &tmp
+		} else {
+			*tree = (*tree)[:max_tree_size]
+		}
 		var count_limit uint32
 		for count_limit = 1; ; count_limit *= 2 {
 			var node int = 0
@@ -455,9 +466,9 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 				l--
 				if histogram[l] != 0 {
 					if histogram[l] >= count_limit {
-						initHuffmanTree(&tree[node:][0], histogram[l], -1, int16(l))
+						initHuffmanTree(&(*tree)[node:][0], histogram[l], -1, int16(l))
 					} else {
-						initHuffmanTree(&tree[node:][0], count_limit, -1, int16(l))
+						initHuffmanTree(&(*tree)[node:][0], count_limit, -1, int16(l))
 					}
 
 					node++
@@ -471,7 +482,7 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 				var j int = n + 1
 				var k int
 
-				sortHuffmanTreeItems(tree, uint(n), huffmanTreeComparator(sortHuffmanTree1))
+				sortHuffmanTreeItems(*tree, uint(n), huffmanTreeComparator(sortHuffmanTree1))
 
 				/* The nodes are:
 				   [0, n): the sorted leaf nodes that we start with.
@@ -482,15 +493,15 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 				   There will be (2n+1) elements at the end. */
 				initHuffmanTree(&sentinel, math.MaxUint32, -1, -1)
 
-				tree[node] = sentinel
+				(*tree)[node] = sentinel
 				node++
-				tree[node] = sentinel
+				(*tree)[node] = sentinel
 				node++
 
 				for k = n - 1; k > 0; k-- {
 					var left int
 					var right int
-					if tree[i].total_count_ <= tree[j].total_count_ {
+					if (*tree)[i].total_count_ <= (*tree)[j].total_count_ {
 						left = i
 						i++
 					} else {
@@ -498,7 +509,7 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 						j++
 					}
 
-					if tree[i].total_count_ <= tree[j].total_count_ {
+					if (*tree)[i].total_count_ <= (*tree)[j].total_count_ {
 						right = i
 						i++
 					} else {
@@ -507,17 +518,17 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 					}
 
 					/* The sentinel node becomes the parent node. */
-					tree[node-1].total_count_ = tree[left].total_count_ + tree[right].total_count_
+					(*tree)[node-1].total_count_ = (*tree)[left].total_count_ + (*tree)[right].total_count_
 
-					tree[node-1].index_left_ = int16(left)
-					tree[node-1].index_right_or_value_ = int16(right)
+					(*tree)[node-1].index_left_ = int16(left)
+					(*tree)[node-1].index_right_or_value_ = int16(right)
 
 					/* Add back the last sentinel node. */
-					tree[node] = sentinel
+					(*tree)[node] = sentinel
 					node++
 				}
 
-				if setDepth(2*n-1, tree, depth, 14) {
+				if setDepth(2*n-1, *tree, depth, 14) {
 					/* We need to pack the Huffman tree in 14 bits. If this was not
 					   successful, add fake entities to the lowest values and retry. */
 					break
@@ -525,7 +536,7 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 			}
 		}
 
-		tree = nil
+		huffmanTreePool.Put(tree)
 	}
 
 	convertBitDepthsToSymbols(depth, length, bits)
@@ -875,27 +886,37 @@ type blockEncoder struct {
 	bits_             []uint16
 }
 
-func initBlockEncoder(self *blockEncoder, histogram_length uint, num_block_types uint, block_types []byte, block_lengths []uint32, num_blocks uint) {
+var blockEncoderPool sync.Pool
+
+func getBlockEncoder(histogram_length uint, num_block_types uint, block_types []byte, block_lengths []uint32, num_blocks uint) *blockEncoder {
+	self, _ := blockEncoderPool.Get().(*blockEncoder)
+
+	if self != nil {
+		self.block_ix_ = 0
+		self.entropy_ix_ = 0
+		self.depths_ = self.depths_[:0]
+		self.bits_ = self.bits_[:0]
+	} else {
+		self = &blockEncoder{}
+	}
+
 	self.histogram_length_ = histogram_length
 	self.num_block_types_ = num_block_types
 	self.block_types_ = block_types
 	self.block_lengths_ = block_lengths
 	self.num_blocks_ = num_blocks
 	initBlockTypeCodeCalculator(&self.block_split_code_.type_code_calculator)
-	self.block_ix_ = 0
 	if num_blocks == 0 {
 		self.block_len_ = 0
 	} else {
 		self.block_len_ = uint(block_lengths[0])
 	}
-	self.entropy_ix_ = 0
-	self.depths_ = nil
-	self.bits_ = nil
+
+	return self
 }
 
 func cleanupBlockEncoder(self *blockEncoder) {
-	self.depths_ = nil
-	self.bits_ = nil
+	blockEncoderPool.Put(self)
 }
 
 /* Creates entropy codes of block lengths and block types and stores them
@@ -948,8 +969,16 @@ func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, conte
 
 func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogramLiteral, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var table_size uint = histograms_size * self.histogram_length_
-	self.depths_ = make([]byte, table_size)
-	self.bits_ = make([]uint16, table_size)
+	if cap(self.depths_) < int(table_size) {
+		self.depths_ = make([]byte, table_size)
+	} else {
+		self.depths_ = self.depths_[:table_size]
+	}
+	if cap(self.bits_) < int(table_size) {
+		self.bits_ = make([]uint16, table_size)
+	} else {
+		self.bits_ = self.bits_[:table_size]
+	}
 	{
 		var i uint
 		for i = 0; i < histograms_size; i++ {
@@ -961,8 +990,16 @@ func buildAndStoreEntropyCodesLiteral(self *blockEncoder, histograms []histogram
 
 func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogramCommand, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var table_size uint = histograms_size * self.histogram_length_
-	self.depths_ = make([]byte, table_size)
-	self.bits_ = make([]uint16, table_size)
+	if cap(self.depths_) < int(table_size) {
+		self.depths_ = make([]byte, table_size)
+	} else {
+		self.depths_ = self.depths_[:table_size]
+	}
+	if cap(self.bits_) < int(table_size) {
+		self.bits_ = make([]uint16, table_size)
+	} else {
+		self.bits_ = self.bits_[:table_size]
+	}
 	{
 		var i uint
 		for i = 0; i < histograms_size; i++ {
@@ -974,8 +1011,16 @@ func buildAndStoreEntropyCodesCommand(self *blockEncoder, histograms []histogram
 
 func buildAndStoreEntropyCodesDistance(self *blockEncoder, histograms []histogramDistance, histograms_size uint, alphabet_size uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var table_size uint = histograms_size * self.histogram_length_
-	self.depths_ = make([]byte, table_size)
-	self.bits_ = make([]uint16, table_size)
+	if cap(self.depths_) < int(table_size) {
+		self.depths_ = make([]byte, table_size)
+	} else {
+		self.depths_ = self.depths_[:table_size]
+	}
+	if cap(self.bits_) < int(table_size) {
+		self.bits_ = make([]uint16, table_size)
+	} else {
+		self.bits_ = self.bits_[:table_size]
+	}
 	{
 		var i uint
 		for i = 0; i < histograms_size; i++ {
@@ -990,16 +1035,13 @@ func jumpToByteBoundary(storage_ix *uint, storage []byte) {
 	storage[*storage_ix>>3] = 0
 }
 
-func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_byte byte, prev_byte2 byte, is_last bool, params *encoderParams, literal_context_mode int, commands []command, n_commands uint, mb *metaBlockSplit, storage_ix *uint, storage []byte) {
+func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_byte byte, prev_byte2 byte, is_last bool, params *encoderParams, literal_context_mode int, commands []command, mb *metaBlockSplit, storage_ix *uint, storage []byte) {
 	var pos uint = start_pos
 	var i uint
 	var num_distance_symbols uint32 = params.dist.alphabet_size
 	var num_effective_distance_symbols uint32 = num_distance_symbols
 	var tree []huffmanTree
 	var literal_context_lut contextLUT = getContextLUT(literal_context_mode)
-	var literal_enc blockEncoder
-	var command_enc blockEncoder
-	var distance_enc blockEncoder
 	var dist *distanceParams = &params.dist
 	if params.large_window && num_effective_distance_symbols > numHistogramDistanceSymbols {
 		num_effective_distance_symbols = numHistogramDistanceSymbols
@@ -1008,13 +1050,13 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 	storeCompressedMetaBlockHeader(is_last, length, storage_ix, storage)
 
 	tree = make([]huffmanTree, maxHuffmanTreeSize)
-	initBlockEncoder(&literal_enc, numLiteralSymbols, mb.literal_split.num_types, mb.literal_split.types, mb.literal_split.lengths, mb.literal_split.num_blocks)
-	initBlockEncoder(&command_enc, numCommandSymbols, mb.command_split.num_types, mb.command_split.types, mb.command_split.lengths, mb.command_split.num_blocks)
-	initBlockEncoder(&distance_enc, uint(num_effective_distance_symbols), mb.distance_split.num_types, mb.distance_split.types, mb.distance_split.lengths, mb.distance_split.num_blocks)
+	literal_enc := getBlockEncoder(numLiteralSymbols, mb.literal_split.num_types, mb.literal_split.types, mb.literal_split.lengths, mb.literal_split.num_blocks)
+	command_enc := getBlockEncoder(numCommandSymbols, mb.command_split.num_types, mb.command_split.types, mb.command_split.lengths, mb.command_split.num_blocks)
+	distance_enc := getBlockEncoder(uint(num_effective_distance_symbols), mb.distance_split.num_types, mb.distance_split.types, mb.distance_split.lengths, mb.distance_split.num_blocks)
 
-	buildAndStoreBlockSwitchEntropyCodes(&literal_enc, tree, storage_ix, storage)
-	buildAndStoreBlockSwitchEntropyCodes(&command_enc, tree, storage_ix, storage)
-	buildAndStoreBlockSwitchEntropyCodes(&distance_enc, tree, storage_ix, storage)
+	buildAndStoreBlockSwitchEntropyCodes(literal_enc, tree, storage_ix, storage)
+	buildAndStoreBlockSwitchEntropyCodes(command_enc, tree, storage_ix, storage)
+	buildAndStoreBlockSwitchEntropyCodes(distance_enc, tree, storage_ix, storage)
 
 	writeBits(2, uint64(dist.distance_postfix_bits), storage_ix, storage)
 	writeBits(4, uint64(dist.num_direct_distance_codes)>>dist.distance_postfix_bits, storage_ix, storage)
@@ -1034,20 +1076,19 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 		encodeContextMap(mb.distance_context_map, mb.distance_context_map_size, mb.distance_histograms_size, tree, storage_ix, storage)
 	}
 
-	buildAndStoreEntropyCodesLiteral(&literal_enc, mb.literal_histograms, mb.literal_histograms_size, numLiteralSymbols, tree, storage_ix, storage)
-	buildAndStoreEntropyCodesCommand(&command_enc, mb.command_histograms, mb.command_histograms_size, numCommandSymbols, tree, storage_ix, storage)
-	buildAndStoreEntropyCodesDistance(&distance_enc, mb.distance_histograms, mb.distance_histograms_size, uint(num_distance_symbols), tree, storage_ix, storage)
+	buildAndStoreEntropyCodesLiteral(literal_enc, mb.literal_histograms, mb.literal_histograms_size, numLiteralSymbols, tree, storage_ix, storage)
+	buildAndStoreEntropyCodesCommand(command_enc, mb.command_histograms, mb.command_histograms_size, numCommandSymbols, tree, storage_ix, storage)
+	buildAndStoreEntropyCodesDistance(distance_enc, mb.distance_histograms, mb.distance_histograms_size, uint(num_distance_symbols), tree, storage_ix, storage)
 	tree = nil
 
-	for i = 0; i < n_commands; i++ {
-		var cmd command = commands[i]
+	for _, cmd := range commands {
 		var cmd_code uint = uint(cmd.cmd_prefix_)
-		storeSymbol(&command_enc, cmd_code, storage_ix, storage)
+		storeSymbol(command_enc, cmd_code, storage_ix, storage)
 		storeCommandExtra(&cmd, storage_ix, storage)
 		if mb.literal_context_map_size == 0 {
 			var j uint
 			for j = uint(cmd.insert_len_); j != 0; j-- {
-				storeSymbol(&literal_enc, uint(input[pos&mask]), storage_ix, storage)
+				storeSymbol(literal_enc, uint(input[pos&mask]), storage_ix, storage)
 				pos++
 			}
 		} else {
@@ -1055,7 +1096,7 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 			for j = uint(cmd.insert_len_); j != 0; j-- {
 				var context uint = uint(getContext(prev_byte, prev_byte2, literal_context_lut))
 				var literal byte = input[pos&mask]
-				storeSymbolWithContext(&literal_enc, uint(literal), context, mb.literal_context_map, storage_ix, storage, literalContextBits)
+				storeSymbolWithContext(literal_enc, uint(literal), context, mb.literal_context_map, storage_ix, storage, literalContextBits)
 				prev_byte2 = prev_byte
 				prev_byte = literal
 				pos++
@@ -1071,10 +1112,10 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 				var distnumextra uint32 = uint32(cmd.dist_prefix_) >> 10
 				var distextra uint64 = uint64(cmd.dist_extra_)
 				if mb.distance_context_map_size == 0 {
-					storeSymbol(&distance_enc, dist_code, storage_ix, storage)
+					storeSymbol(distance_enc, dist_code, storage_ix, storage)
 				} else {
 					var context uint = uint(commandDistanceContext(&cmd))
-					storeSymbolWithContext(&distance_enc, dist_code, context, mb.distance_context_map, storage_ix, storage, distanceContextBits)
+					storeSymbolWithContext(distance_enc, dist_code, context, mb.distance_context_map, storage_ix, storage, distanceContextBits)
 				}
 
 				writeBits(uint(distnumextra), distextra, storage_ix, storage)
@@ -1082,19 +1123,17 @@ func storeMetaBlock(input []byte, start_pos uint, length uint, mask uint, prev_b
 		}
 	}
 
-	cleanupBlockEncoder(&distance_enc)
-	cleanupBlockEncoder(&command_enc)
-	cleanupBlockEncoder(&literal_enc)
+	cleanupBlockEncoder(distance_enc)
+	cleanupBlockEncoder(command_enc)
+	cleanupBlockEncoder(literal_enc)
 	if is_last {
 		jumpToByteBoundary(storage_ix, storage)
 	}
 }
 
-func buildHistograms(input []byte, start_pos uint, mask uint, commands []command, n_commands uint, lit_histo *histogramLiteral, cmd_histo *histogramCommand, dist_histo *histogramDistance) {
+func buildHistograms(input []byte, start_pos uint, mask uint, commands []command, lit_histo *histogramLiteral, cmd_histo *histogramCommand, dist_histo *histogramDistance) {
 	var pos uint = start_pos
-	var i uint
-	for i = 0; i < n_commands; i++ {
-		var cmd command = commands[i]
+	for _, cmd := range commands {
 		var j uint
 		histogramAddCommand(cmd_histo, uint(cmd.cmd_prefix_))
 		for j = uint(cmd.insert_len_); j != 0; j-- {
@@ -1109,11 +1148,9 @@ func buildHistograms(input []byte, start_pos uint, mask uint, commands []command
 	}
 }
 
-func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands []command, n_commands uint, lit_depth []byte, lit_bits []uint16, cmd_depth []byte, cmd_bits []uint16, dist_depth []byte, dist_bits []uint16, storage_ix *uint, storage []byte) {
+func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands []command, lit_depth []byte, lit_bits []uint16, cmd_depth []byte, cmd_bits []uint16, dist_depth []byte, dist_bits []uint16, storage_ix *uint, storage []byte) {
 	var pos uint = start_pos
-	var i uint
-	for i = 0; i < n_commands; i++ {
-		var cmd command = commands[i]
+	for _, cmd := range commands {
 		var cmd_code uint = uint(cmd.cmd_prefix_)
 		var j uint
 		writeBits(uint(cmd_depth[cmd_code]), uint64(cmd_bits[cmd_code]), storage_ix, storage)
@@ -1135,7 +1172,7 @@ func storeDataWithHuffmanCodes(input []byte, start_pos uint, mask uint, commands
 	}
 }
 
-func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, n_commands uint, storage_ix *uint, storage []byte) {
+func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, storage_ix *uint, storage []byte) {
 	var lit_histo histogramLiteral
 	var cmd_histo histogramCommand
 	var dist_histo histogramDistance
@@ -1154,7 +1191,7 @@ func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint,
 	histogramClearCommand(&cmd_histo)
 	histogramClearDistance(&dist_histo)
 
-	buildHistograms(input, start_pos, mask, commands, n_commands, &lit_histo, &cmd_histo, &dist_histo)
+	buildHistograms(input, start_pos, mask, commands, &lit_histo, &cmd_histo, &dist_histo)
 
 	writeBits(13, 0, storage_ix, storage)
 
@@ -1163,13 +1200,13 @@ func storeMetaBlockTrivial(input []byte, start_pos uint, length uint, mask uint,
 	buildAndStoreHuffmanTree(cmd_histo.data_[:], numCommandSymbols, numCommandSymbols, tree, cmd_depth[:], cmd_bits[:], storage_ix, storage)
 	buildAndStoreHuffmanTree(dist_histo.data_[:], maxSimpleDistanceAlphabetSize, uint(num_distance_symbols), tree, dist_depth[:], dist_bits[:], storage_ix, storage)
 	tree = nil
-	storeDataWithHuffmanCodes(input, start_pos, mask, commands, n_commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage)
+	storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage)
 	if is_last {
 		jumpToByteBoundary(storage_ix, storage)
 	}
 }
 
-func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, n_commands uint, storage_ix *uint, storage []byte) {
+func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is_last bool, params *encoderParams, commands []command, storage_ix *uint, storage []byte) {
 	var num_distance_symbols uint32 = params.dist.alphabet_size
 	var distance_alphabet_bits uint32 = log2FloorNonZero(uint(num_distance_symbols-1)) + 1
 
@@ -1177,15 +1214,13 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 
 	writeBits(13, 0, storage_ix, storage)
 
-	if n_commands <= 128 {
+	if len(commands) <= 128 {
 		var histogram = [numLiteralSymbols]uint32{0}
 		var pos uint = start_pos
 		var num_literals uint = 0
-		var i uint
 		var lit_depth [numLiteralSymbols]byte
 		var lit_bits [numLiteralSymbols]uint16
-		for i = 0; i < n_commands; i++ {
-			var cmd command = commands[i]
+		for _, cmd := range commands {
 			var j uint
 			for j = uint(cmd.insert_len_); j != 0; j-- {
 				histogram[input[pos&mask]]++
@@ -1201,7 +1236,7 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 
 		storeStaticCommandHuffmanTree(storage_ix, storage)
 		storeStaticDistanceHuffmanTree(storage_ix, storage)
-		storeDataWithHuffmanCodes(input, start_pos, mask, commands, n_commands, lit_depth[:], lit_bits[:], kStaticCommandCodeDepth[:], kStaticCommandCodeBits[:], kStaticDistanceCodeDepth[:], kStaticDistanceCodeBits[:], storage_ix, storage)
+		storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], kStaticCommandCodeDepth[:], kStaticCommandCodeBits[:], kStaticDistanceCodeDepth[:], kStaticDistanceCodeBits[:], storage_ix, storage)
 	} else {
 		var lit_histo histogramLiteral
 		var cmd_histo histogramCommand
@@ -1215,7 +1250,7 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 		histogramClearLiteral(&lit_histo)
 		histogramClearCommand(&cmd_histo)
 		histogramClearDistance(&dist_histo)
-		buildHistograms(input, start_pos, mask, commands, n_commands, &lit_histo, &cmd_histo, &dist_histo)
+		buildHistograms(input, start_pos, mask, commands, &lit_histo, &cmd_histo, &dist_histo)
 		buildAndStoreHuffmanTreeFast(lit_histo.data_[:], lit_histo.total_count_, /* max_bits = */
 			8, lit_depth[:], lit_bits[:], storage_ix, storage)
 
@@ -1225,7 +1260,7 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 		buildAndStoreHuffmanTreeFast(dist_histo.data_[:], dist_histo.total_count_, /* max_bits = */
 			uint(distance_alphabet_bits), dist_depth[:], dist_bits[:], storage_ix, storage)
 
-		storeDataWithHuffmanCodes(input, start_pos, mask, commands, n_commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage)
+		storeDataWithHuffmanCodes(input, start_pos, mask, commands, lit_depth[:], lit_bits[:], cmd_depth[:], cmd_bits[:], dist_depth[:], dist_bits[:], storage_ix, storage)
 	}
 
 	if is_last {
diff --git a/vendor/github.com/andybalholm/brotli/cluster_command.go b/vendor/github.com/andybalholm/brotli/cluster_command.go
index 7449751..45b569b 100644
--- a/vendor/github.com/andybalholm/brotli/cluster_command.go
+++ b/vendor/github.com/andybalholm/brotli/cluster_command.go
@@ -1,7 +1,5 @@
 package brotli
 
-import "math"
-
 /* Copyright 2013 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -164,163 +162,3 @@ func histogramBitCostDistanceCommand(histogram *histogramCommand, candidate *his
 		return populationCostCommand(&tmp) - candidate.bit_cost_
 	}
 }
-
-/* Find the best 'out' histogram for each of the 'in' histograms.
-   When called, clusters[0..num_clusters) contains the unique values from
-   symbols[0..in_size), but this property is not preserved in this function.
-   Note: we assume that out[]->bit_cost_ is already up-to-date. */
-func histogramRemapCommand(in []histogramCommand, in_size uint, clusters []uint32, num_clusters uint, out []histogramCommand, symbols []uint32) {
-	var i uint
-	for i = 0; i < in_size; i++ {
-		var best_out uint32
-		if i == 0 {
-			best_out = symbols[0]
-		} else {
-			best_out = symbols[i-1]
-		}
-		var best_bits float64 = histogramBitCostDistanceCommand(&in[i], &out[best_out])
-		var j uint
-		for j = 0; j < num_clusters; j++ {
-			var cur_bits float64 = histogramBitCostDistanceCommand(&in[i], &out[clusters[j]])
-			if cur_bits < best_bits {
-				best_bits = cur_bits
-				best_out = clusters[j]
-			}
-		}
-
-		symbols[i] = best_out
-	}
-
-	/* Recompute each out based on raw and symbols. */
-	for i = 0; i < num_clusters; i++ {
-		histogramClearCommand(&out[clusters[i]])
-	}
-
-	for i = 0; i < in_size; i++ {
-		histogramAddHistogramCommand(&out[symbols[i]], &in[i])
-	}
-}
-
-/* Reorders elements of the out[0..length) array and changes values in
-   symbols[0..length) array in the following way:
-     * when called, symbols[] contains indexes into out[], and has N unique
-       values (possibly N < length)
-     * on return, symbols'[i] = f(symbols[i]) and
-                  out'[symbols'[i]] = out[symbols[i]], for each 0 <= i < length,
-       where f is a bijection between the range of symbols[] and [0..N), and
-       the first occurrences of values in symbols'[i] come in consecutive
-       increasing order.
-   Returns N, the number of unique values in symbols[]. */
-
-var histogramReindexCommand_kInvalidIndex uint32 = math.MaxUint32
-
-func histogramReindexCommand(out []histogramCommand, symbols []uint32, length uint) uint {
-	var new_index []uint32 = make([]uint32, length)
-	var next_index uint32
-	var tmp []histogramCommand
-	var i uint
-	for i = 0; i < length; i++ {
-		new_index[i] = histogramReindexCommand_kInvalidIndex
-	}
-
-	next_index = 0
-	for i = 0; i < length; i++ {
-		if new_index[symbols[i]] == histogramReindexCommand_kInvalidIndex {
-			new_index[symbols[i]] = next_index
-			next_index++
-		}
-	}
-
-	/* TODO: by using idea of "cycle-sort" we can avoid allocation of
-	   tmp and reduce the number of copying by the factor of 2. */
-	tmp = make([]histogramCommand, next_index)
-
-	next_index = 0
-	for i = 0; i < length; i++ {
-		if new_index[symbols[i]] == next_index {
-			tmp[next_index] = out[symbols[i]]
-			next_index++
-		}
-
-		symbols[i] = new_index[symbols[i]]
-	}
-
-	new_index = nil
-	for i = 0; uint32(i) < next_index; i++ {
-		out[i] = tmp[i]
-	}
-
-	tmp = nil
-	return uint(next_index)
-}
-
-func clusterHistogramsCommand(in []histogramCommand, in_size uint, max_histograms uint, out []histogramCommand, out_size *uint, histogram_symbols []uint32) {
-	var cluster_size []uint32 = make([]uint32, in_size)
-	var clusters []uint32 = make([]uint32, in_size)
-	var num_clusters uint = 0
-	var max_input_histograms uint = 64
-	var pairs_capacity uint = max_input_histograms * max_input_histograms / 2
-	var pairs []histogramPair = make([]histogramPair, (pairs_capacity + 1))
-	var i uint
-
-	/* For the first pass of clustering, we allow all pairs. */
-	for i = 0; i < in_size; i++ {
-		cluster_size[i] = 1
-	}
-
-	for i = 0; i < in_size; i++ {
-		out[i] = in[i]
-		out[i].bit_cost_ = populationCostCommand(&in[i])
-		histogram_symbols[i] = uint32(i)
-	}
-
-	for i = 0; i < in_size; i += max_input_histograms {
-		var num_to_combine uint = brotli_min_size_t(in_size-i, max_input_histograms)
-		var num_new_clusters uint
-		var j uint
-		for j = 0; j < num_to_combine; j++ {
-			clusters[num_clusters+j] = uint32(i + j)
-		}
-
-		num_new_clusters = histogramCombineCommand(out, cluster_size, histogram_symbols[i:], clusters[num_clusters:], pairs, num_to_combine, num_to_combine, max_histograms, pairs_capacity)
-		num_clusters += num_new_clusters
-	}
-	{
-		/* For the second pass, we limit the total number of histogram pairs.
-		   After this limit is reached, we only keep searching for the best pair. */
-		var max_num_pairs uint = brotli_min_size_t(64*num_clusters, (num_clusters/2)*num_clusters)
-		if pairs_capacity < (max_num_pairs + 1) {
-			var _new_size uint
-			if pairs_capacity == 0 {
-				_new_size = max_num_pairs + 1
-			} else {
-				_new_size = pairs_capacity
-			}
-			var new_array []histogramPair
-			for _new_size < (max_num_pairs + 1) {
-				_new_size *= 2
-			}
-			new_array = make([]histogramPair, _new_size)
-			if pairs_capacity != 0 {
-				copy(new_array, pairs[:pairs_capacity])
-			}
-
-			pairs = new_array
-			pairs_capacity = _new_size
-		}
-
-		/* Collapse similar histograms. */
-		num_clusters = histogramCombineCommand(out, cluster_size, histogram_symbols, clusters, pairs, num_clusters, in_size, max_histograms, max_num_pairs)
-	}
-
-	pairs = nil
-	cluster_size = nil
-
-	/* Find the optimal map from original histograms to the final ones. */
-	histogramRemapCommand(in, in_size, clusters, num_clusters, out, histogram_symbols)
-
-	clusters = nil
-
-	/* Convert the context map to a canonical form. */
-	*out_size = histogramReindexCommand(out, histogram_symbols, in_size)
-}
diff --git a/vendor/github.com/andybalholm/brotli/command.go b/vendor/github.com/andybalholm/brotli/command.go
index e93ccdf..b1662a5 100644
--- a/vendor/github.com/andybalholm/brotli/command.go
+++ b/vendor/github.com/andybalholm/brotli/command.go
@@ -194,26 +194,28 @@ type command struct {
 }
 
 /* distance_code is e.g. 0 for same-as-last short code, or 16 for offset 1. */
-func initCommand(self *command, dist *distanceParams, insertlen uint, copylen uint, copylen_code_delta int, distance_code uint) {
+func makeCommand(dist *distanceParams, insertlen uint, copylen uint, copylen_code_delta int, distance_code uint) (cmd command) {
 	/* Don't rely on signed int representation, use honest casts. */
 	var delta uint32 = uint32(byte(int8(copylen_code_delta)))
-	self.insert_len_ = uint32(insertlen)
-	self.copy_len_ = uint32(uint32(copylen) | delta<<25)
+	cmd.insert_len_ = uint32(insertlen)
+	cmd.copy_len_ = uint32(uint32(copylen) | delta<<25)
 
 	/* The distance prefix and extra bits are stored in this Command as if
 	   npostfix and ndirect were 0, they are only recomputed later after the
 	   clustering if needed. */
-	prefixEncodeCopyDistance(distance_code, uint(dist.num_direct_distance_codes), uint(dist.distance_postfix_bits), &self.dist_prefix_, &self.dist_extra_)
+	prefixEncodeCopyDistance(distance_code, uint(dist.num_direct_distance_codes), uint(dist.distance_postfix_bits), &cmd.dist_prefix_, &cmd.dist_extra_)
+	getLengthCode(insertlen, uint(int(copylen)+copylen_code_delta), (cmd.dist_prefix_&0x3FF == 0), &cmd.cmd_prefix_)
 
-	getLengthCode(insertlen, uint(int(copylen)+copylen_code_delta), (self.dist_prefix_&0x3FF == 0), &self.cmd_prefix_)
+	return cmd
 }
 
-func initInsertCommand(self *command, insertlen uint) {
-	self.insert_len_ = uint32(insertlen)
-	self.copy_len_ = 4 << 25
-	self.dist_extra_ = 0
-	self.dist_prefix_ = numDistanceShortCodes
-	getLengthCode(insertlen, 4, false, &self.cmd_prefix_)
+func makeInsertCommand(insertlen uint) (cmd command) {
+	cmd.insert_len_ = uint32(insertlen)
+	cmd.copy_len_ = 4 << 25
+	cmd.dist_extra_ = 0
+	cmd.dist_prefix_ = numDistanceShortCodes
+	getLengthCode(insertlen, 4, false, &cmd.cmd_prefix_)
+	return cmd
 }
 
 func commandRestoreDistanceCode(self *command, dist *distanceParams) uint32 {
diff --git a/vendor/github.com/andybalholm/brotli/compress_fragment.go b/vendor/github.com/andybalholm/brotli/compress_fragment.go
index 435898e..c9bd057 100644
--- a/vendor/github.com/andybalholm/brotli/compress_fragment.go
+++ b/vendor/github.com/andybalholm/brotli/compress_fragment.go
@@ -33,14 +33,8 @@ func hashBytesAtOffset5(v uint64, offset int, shift uint) uint32 {
 }
 
 func isMatch5(p1 []byte, p2 []byte) bool {
-	var i int
-	for i = 0; i < 5; i++ {
-		if p1[i] != p2[i] {
-			return false
-		}
-	}
-
-	return true
+	return binary.LittleEndian.Uint32(p1) == binary.LittleEndian.Uint32(p2) &&
+		p1[4] == p2[4]
 }
 
 /* Builds a literal prefix code into "depths" and "bits" based on the statistics
@@ -610,7 +604,7 @@ emit_commands:
 				assert(candidate < ip)
 
 				table[hash] = int(ip - base_ip)
-				if !(!isMatch5(in[ip:], in[candidate:])) {
+				if isMatch5(in[ip:], in[candidate:]) {
 					break
 				}
 			}
diff --git a/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go b/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
index ffeb321..172dc7f 100644
--- a/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
+++ b/vendor/github.com/andybalholm/brotli/compress_fragment_two_pass.go
@@ -30,14 +30,13 @@ func hashBytesAtOffset(v uint64, offset uint, shift uint, length uint) uint32 {
 }
 
 func isMatch1(p1 []byte, p2 []byte, length uint) bool {
-	var i uint
-	for i = 0; i < length && i < 6; i++ {
-		if p1[i] != p2[i] {
-			return false
-		}
+	if binary.LittleEndian.Uint32(p1) != binary.LittleEndian.Uint32(p2) {
+		return false
 	}
-
-	return true
+	if length == 4 {
+		return true
+	}
+	return p1[4] == p2[4] && p1[5] == p2[5]
 }
 
 /* Builds a command and distance prefix code (each 64 symbols) into "depth" and
diff --git a/vendor/github.com/andybalholm/brotli/decode.go b/vendor/github.com/andybalholm/brotli/decode.go
index d2f39a0..6a73b88 100644
--- a/vendor/github.com/andybalholm/brotli/decode.go
+++ b/vendor/github.com/andybalholm/brotli/decode.go
@@ -50,21 +50,6 @@ const (
 	decoderErrorUnreachable                 = -31
 )
 
-/**
- * The value of the last error code, negative integer.
- *
- * All other error code values are in the range from ::lastErrorCode
- * to @c -1. There are also 4 other possible non-error codes @c 0 .. @c 3 in
- * ::BrotliDecoderErrorCode enumeration.
- */
-const lastErrorCode = decoderErrorUnreachable
-
-/** Options to be used with ::BrotliDecoderSetParameter. */
-const (
-	decoderParamDisableRingBufferReallocation = 0
-	decoderParamLargeWindow                   = 1
-)
-
 const huffmanTableBits = 8
 
 const huffmanTableMask = 0xFF
@@ -81,28 +66,6 @@ var kCodeLengthPrefixLength = [16]byte{2, 2, 2, 3, 2, 2, 2, 4, 2, 2, 2, 3, 2, 2,
 
 var kCodeLengthPrefixValue = [16]byte{0, 4, 3, 2, 0, 4, 3, 1, 0, 4, 3, 2, 0, 4, 3, 5}
 
-func decoderSetParameter(state *Reader, p int, value uint32) bool {
-	if state.state != stateUninited {
-		return false
-	}
-	switch p {
-	case decoderParamDisableRingBufferReallocation:
-		if !(value == 0) {
-			state.canny_ringbuffer_allocation = 0
-		} else {
-			state.canny_ringbuffer_allocation = 1
-		}
-		return true
-
-	case decoderParamLargeWindow:
-		state.large_window = (!(value == 0))
-		return true
-
-	default:
-		return false
-	}
-}
-
 /* Saves error code and converts it to BrotliDecoderResult. */
 func saveErrorCode(s *Reader, e int) int {
 	s.error_code = int(e)
@@ -1125,10 +1088,8 @@ func decodeContextMap(context_map_size uint32, num_htrees *uint32, context_map_a
    Reads 3..54 bits. */
 func decodeBlockTypeAndLength(safe int, s *Reader, tree_type int) bool {
 	var max_block_type uint32 = s.num_block_types[tree_type]
-	var type_tree []huffmanCode
-	type_tree = s.block_type_trees[tree_type*huffmanMaxSize258:]
-	var len_tree []huffmanCode
-	len_tree = s.block_len_trees[tree_type*huffmanMaxSize26:]
+	type_tree := s.block_type_trees[tree_type*huffmanMaxSize258:]
+	len_tree := s.block_len_trees[tree_type*huffmanMaxSize26:]
 	var br *bitReader = &s.br
 	var ringbuffer []uint32 = s.block_type_rb[tree_type*2:]
 	var block_type uint32
@@ -1280,8 +1241,7 @@ func unwrittenBytes(s *Reader, wrap bool) uint {
    Returns BROTLI_DECODER_NEEDS_MORE_OUTPUT only if there is more output to push
    and either ring-buffer is as big as window size, or |force| is true. */
 func writeRingBuffer(s *Reader, available_out *uint, next_out *[]byte, total_out *uint, force bool) int {
-	var start []byte
-	start = s.ringbuffer[s.partial_pos_out&uint(s.ringbuffer_mask):]
+	start := s.ringbuffer[s.partial_pos_out&uint(s.ringbuffer_mask):]
 	var to_write uint = unwrittenBytes(s, true)
 	var num_written uint = *available_out
 	if num_written > to_write {
@@ -1412,8 +1372,7 @@ func copyUncompressedBlockToOutput(available_out *uint, next_out *[]byte, total_
 
 		case stateUncompressedWrite:
 			{
-				var result int
-				result = writeRingBuffer(s, available_out, next_out, total_out, false)
+				result := writeRingBuffer(s, available_out, next_out, total_out, false)
 				if result != decoderSuccess {
 					return result
 				}
@@ -1931,8 +1890,7 @@ CommandPostDecodeLiterals:
 			}
 
 			if transform_idx < int(trans.num_transforms) {
-				var word []byte
-				word = words.data[offset:]
+				word := words.data[offset:]
 				var len int = i
 				if transform_idx == int(trans.cutOffTransforms[0]) {
 					copy(s.ringbuffer[pos:], word[:uint(len)])
@@ -1954,10 +1912,8 @@ CommandPostDecodeLiterals:
 		}
 	} else {
 		var src_start int = (pos - s.distance_code) & s.ringbuffer_mask
-		var copy_dst []byte
-		copy_dst = s.ringbuffer[pos:]
-		var copy_src []byte
-		copy_src = s.ringbuffer[src_start:]
+		copy_dst := s.ringbuffer[pos:]
+		copy_src := s.ringbuffer[src_start:]
 		var dst_end int = pos + i
 		var src_end int = src_start + i
 
@@ -2494,8 +2450,6 @@ func decoderDecompressStream(s *Reader, available_in *uint, next_in *[]byte, ava
 				} else {
 					s.state = stateCommandBegin
 				}
-
-				break
 			} else if s.state == stateCommandPostWrite2 {
 				s.state = stateCommandPostWrapCopy /* BROTLI_STATE_COMMAND_INNER_WRITE */
 			} else {
diff --git a/vendor/github.com/andybalholm/brotli/encode.go b/vendor/github.com/andybalholm/brotli/encode.go
index c01322b..8e25a4e 100644
--- a/vendor/github.com/andybalholm/brotli/encode.go
+++ b/vendor/github.com/andybalholm/brotli/encode.go
@@ -74,14 +74,13 @@ const (
 type Writer struct {
 	dst     io.Writer
 	options WriterOptions
+	err     error
 
 	params              encoderParams
 	hasher_             hasherHandle
 	input_pos_          uint64
 	ringbuffer_         ringBuffer
-	cmd_alloc_size_     uint
-	commands_           []command
-	num_commands_       uint
+	commands            []command
 	num_literals_       uint
 	last_insert_len_    uint
 	last_flush_pos_     uint64
@@ -92,8 +91,7 @@ type Writer struct {
 	last_bytes_bits_    byte
 	prev_byte_          byte
 	prev_byte2_         byte
-	storage_size_       uint
-	storage_            []byte
+	storage             []byte
 	small_table_        [1 << 10]int
 	large_table_        []int
 	large_table_size_   uint
@@ -103,9 +101,6 @@ type Writer struct {
 	cmd_code_numbits_   uint
 	command_buf_        []uint32
 	literal_buf_        []byte
-	next_out_           []byte
-	available_out_      uint
-	total_out_          uint
 	tiny_buf_           struct {
 		u64 [2]uint64
 		u8  [16]byte
@@ -146,14 +141,12 @@ func wrapPosition(position uint64) uint32 {
 	return result
 }
 
-func getBrotliStorage(s *Writer, size uint) []byte {
-	if s.storage_size_ < size {
-		s.storage_ = nil
-		s.storage_ = make([]byte, size)
-		s.storage_size_ = size
+func (s *Writer) getStorage(size int) []byte {
+	if len(s.storage) < size {
+		s.storage = make([]byte, size)
 	}
 
-	return s.storage_
+	return s.storage
 }
 
 func hashTableSize(max_table_size uint, input_size uint) uint {
@@ -222,332 +215,6 @@ func encodeWindowBits(lgwin int, large_window bool, last_bytes *uint16, last_byt
 	}
 }
 
-/* Initializes the command and distance prefix codes for the first block. */
-
-var initCommandPrefixCodes_kDefaultCommandDepths = [128]byte{
-	0,
-	4,
-	4,
-	5,
-	6,
-	6,
-	7,
-	7,
-	7,
-	7,
-	7,
-	8,
-	8,
-	8,
-	8,
-	8,
-	0,
-	0,
-	0,
-	4,
-	4,
-	4,
-	4,
-	4,
-	5,
-	5,
-	6,
-	6,
-	6,
-	6,
-	7,
-	7,
-	7,
-	7,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	0,
-	4,
-	4,
-	5,
-	5,
-	5,
-	6,
-	6,
-	7,
-	8,
-	8,
-	9,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	10,
-	5,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	6,
-	6,
-	6,
-	6,
-	6,
-	6,
-	5,
-	5,
-	5,
-	5,
-	5,
-	5,
-	4,
-	4,
-	4,
-	4,
-	4,
-	4,
-	4,
-	5,
-	5,
-	5,
-	5,
-	5,
-	5,
-	6,
-	6,
-	7,
-	7,
-	7,
-	8,
-	10,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-	12,
-}
-var initCommandPrefixCodes_kDefaultCommandBits = [128]uint16{
-	0,
-	0,
-	8,
-	9,
-	3,
-	35,
-	7,
-	71,
-	39,
-	103,
-	23,
-	47,
-	175,
-	111,
-	239,
-	31,
-	0,
-	0,
-	0,
-	4,
-	12,
-	2,
-	10,
-	6,
-	13,
-	29,
-	11,
-	43,
-	27,
-	59,
-	87,
-	55,
-	15,
-	79,
-	319,
-	831,
-	191,
-	703,
-	447,
-	959,
-	0,
-	14,
-	1,
-	25,
-	5,
-	21,
-	19,
-	51,
-	119,
-	159,
-	95,
-	223,
-	479,
-	991,
-	63,
-	575,
-	127,
-	639,
-	383,
-	895,
-	255,
-	767,
-	511,
-	1023,
-	14,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	27,
-	59,
-	7,
-	39,
-	23,
-	55,
-	30,
-	1,
-	17,
-	9,
-	25,
-	5,
-	0,
-	8,
-	4,
-	12,
-	2,
-	10,
-	6,
-	21,
-	13,
-	29,
-	3,
-	19,
-	11,
-	15,
-	47,
-	31,
-	95,
-	63,
-	127,
-	255,
-	767,
-	2815,
-	1791,
-	3839,
-	511,
-	2559,
-	1535,
-	3583,
-	1023,
-	3071,
-	2047,
-	4095,
-}
-var initCommandPrefixCodes_kDefaultCommandCode = []byte{
-	0xff,
-	0x77,
-	0xd5,
-	0xbf,
-	0xe7,
-	0xde,
-	0xea,
-	0x9e,
-	0x51,
-	0x5d,
-	0xde,
-	0xc6,
-	0x70,
-	0x57,
-	0xbc,
-	0x58,
-	0x58,
-	0x58,
-	0xd8,
-	0xd8,
-	0x58,
-	0xd5,
-	0xcb,
-	0x8c,
-	0xea,
-	0xe0,
-	0xc3,
-	0x87,
-	0x1f,
-	0x83,
-	0xc1,
-	0x60,
-	0x1c,
-	0x67,
-	0xb2,
-	0xaa,
-	0x06,
-	0x83,
-	0xc1,
-	0x60,
-	0x30,
-	0x18,
-	0xcc,
-	0xa1,
-	0xce,
-	0x88,
-	0x54,
-	0x94,
-	0x46,
-	0xe1,
-	0xb0,
-	0xd0,
-	0x4e,
-	0xb2,
-	0xf7,
-	0x04,
-	0x00,
-}
-var initCommandPrefixCodes_kDefaultCommandCodeNumBits uint = 448
-
-func initCommandPrefixCodes(cmd_depths []byte, cmd_bits []uint16, cmd_code []byte, cmd_code_numbits *uint) {
-	copy(cmd_depths, initCommandPrefixCodes_kDefaultCommandDepths[:])
-	copy(cmd_bits, initCommandPrefixCodes_kDefaultCommandBits[:])
-
-	/* Initialize the pre-compressed form of the command and distance prefix
-	   codes. */
-	copy(cmd_code, initCommandPrefixCodes_kDefaultCommandCode)
-
-	*cmd_code_numbits = initCommandPrefixCodes_kDefaultCommandCodeNumBits
-}
-
 /* Decide about the context map based on the ability of the prediction
    ability of the previous byte UTF8-prefix on the next byte. The
    prediction ability is calculated as Shannon entropy. Here we need
@@ -557,136 +224,16 @@ func initCommandPrefixCodes(cmd_depths []byte, cmd_bits []uint16, cmd_code []byt
    coding. */
 
 var kStaticContextMapContinuation = [64]uint32{
-	1,
-	1,
-	2,
-	2,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
+	1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 }
 var kStaticContextMapSimpleUTF8 = [64]uint32{
-	0,
-	0,
-	1,
-	1,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
-	0,
+	0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 }
 
 func chooseContextMap(quality int, bigram_histo []uint32, num_literal_contexts *uint, literal_context_map *[]uint32) {
@@ -738,70 +285,22 @@ func chooseContextMap(quality int, bigram_histo []uint32, num_literal_contexts *
    first 5 bits of literals. */
 
 var kStaticContextMapComplexUTF8 = [64]uint32{
-	11,
-	11,
-	12,
-	12,
-	0,
-	0,
-	0,
-	0,
-	1,
-	1,
-	9,
-	9,
-	2,
-	2,
-	2,
-	2,
-	1,
-	1,
-	1,
-	1,
-	8,
-	3,
-	3,
-	3,
-	1,
-	1,
-	1,
-	1,
-	2,
-	2,
-	2,
-	2,
-	8,
-	4,
-	4,
-	4,
-	8,
-	7,
-	4,
-	4,
-	8,
-	0,
-	0,
-	0,
-	3,
-	3,
-	3,
-	3,
-	5,
-	5,
-	10,
-	5,
-	5,
-	5,
-	10,
-	5,
-	6,
-	6,
-	6,
-	6,
-	6,
-	6,
-	6,
-	6,
+	11, 11, 12, 12, /* 0 special */
+	0, 0, 0, 0, /* 4 lf */
+	1, 1, 9, 9, /* 8 space */
+	2, 2, 2, 2, /* !, first after space/lf and after something else. */
+	1, 1, 1, 1, /* " */
+	8, 3, 3, 3, /* % */
+	1, 1, 1, 1, /* ({[ */
+	2, 2, 2, 2, /* }]) */
+	8, 4, 4, 4, /* :; */
+	8, 7, 4, 4, /* . */
+	8, 0, 0, 0, /* > */
+	3, 3, 3, 3, /* [0..9] */
+	5, 5, 10, 5, /* [A-Z] */
+	5, 5, 10, 5,
+	6, 6, 6, 6, /* [a-z] */
+	6, 6, 6, 6,
 }
 
 func shouldUseComplexStaticContextMap(input []byte, start_pos uint, length uint, mask uint, quality int, size_hint uint, num_literal_contexts *uint, literal_context_map *[]uint32) bool {
@@ -933,7 +432,7 @@ func chooseContextMode(params *encoderParams, data []byte, pos uint, mask uint,
 	return contextUTF8
 }
 
-func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes uint, is_last bool, literal_context_mode int, params *encoderParams, prev_byte byte, prev_byte2 byte, num_literals uint, num_commands uint, commands []command, saved_dist_cache []int, dist_cache []int, storage_ix *uint, storage []byte) {
+func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes uint, is_last bool, literal_context_mode int, params *encoderParams, prev_byte byte, prev_byte2 byte, num_literals uint, commands []command, saved_dist_cache []int, dist_cache []int, storage_ix *uint, storage []byte) {
 	var wrapped_last_flush_pos uint32 = wrapPosition(last_flush_pos)
 	var last_bytes uint16
 	var last_bytes_bits byte
@@ -948,7 +447,7 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes
 		return
 	}
 
-	if !shouldCompress_encode(data, mask, last_flush_pos, bytes, num_literals, num_commands) {
+	if !shouldCompress_encode(data, mask, last_flush_pos, bytes, num_literals, uint(len(commands))) {
 		/* Restore the distance cache, as its last update by
 		   CreateBackwardReferences is now unused. */
 		copy(dist_cache, saved_dist_cache[:4])
@@ -961,12 +460,11 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes
 	last_bytes = uint16(storage[1])<<8 | uint16(storage[0])
 	last_bytes_bits = byte(*storage_ix)
 	if params.quality <= maxQualityForStaticEntropyCodes {
-		storeMetaBlockFast(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, num_commands, storage_ix, storage)
+		storeMetaBlockFast(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, storage_ix, storage)
 	} else if params.quality < minQualityForBlockSplit {
-		storeMetaBlockTrivial(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, num_commands, storage_ix, storage)
+		storeMetaBlockTrivial(data, uint(wrapped_last_flush_pos), bytes, mask, is_last, params, commands, storage_ix, storage)
 	} else {
-		var mb metaBlockSplit
-		initMetaBlockSplit(&mb)
+		mb := getMetaBlockSplit()
 		if params.quality < minQualityForHqBlockSplitting {
 			var num_literal_contexts uint = 1
 			var literal_context_map []uint32 = nil
@@ -974,9 +472,9 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes
 				decideOverLiteralContextModeling(data, uint(wrapped_last_flush_pos), bytes, mask, params.quality, params.size_hint, &num_literal_contexts, &literal_context_map)
 			}
 
-			buildMetaBlockGreedy(data, uint(wrapped_last_flush_pos), mask, prev_byte, prev_byte2, literal_context_lut, num_literal_contexts, literal_context_map, commands, num_commands, &mb)
+			buildMetaBlockGreedy(data, uint(wrapped_last_flush_pos), mask, prev_byte, prev_byte2, literal_context_lut, num_literal_contexts, literal_context_map, commands, mb)
 		} else {
-			buildMetaBlock(data, uint(wrapped_last_flush_pos), mask, &block_params, prev_byte, prev_byte2, commands, num_commands, literal_context_mode, &mb)
+			buildMetaBlock(data, uint(wrapped_last_flush_pos), mask, &block_params, prev_byte, prev_byte2, commands, literal_context_mode, mb)
 		}
 
 		if params.quality >= minQualityForOptimizeHistograms {
@@ -988,11 +486,11 @@ func writeMetaBlockInternal(data []byte, mask uint, last_flush_pos uint64, bytes
 				num_effective_dist_codes = numHistogramDistanceSymbols
 			}
 
-			optimizeHistograms(num_effective_dist_codes, &mb)
+			optimizeHistograms(num_effective_dist_codes, mb)
 		}
 
-		storeMetaBlock(data, uint(wrapped_last_flush_pos), bytes, mask, prev_byte, prev_byte2, is_last, &block_params, literal_context_mode, commands, num_commands, &mb, storage_ix, storage)
-		destroyMetaBlockSplit(&mb)
+		storeMetaBlock(data, uint(wrapped_last_flush_pos), bytes, mask, prev_byte, prev_byte2, is_last, &block_params, literal_context_mode, commands, mb, storage_ix, storage)
+		freeMetaBlockSplit(mb)
 	}
 
 	if bytes+4 < *storage_ix>>3 {
@@ -1056,7 +554,38 @@ func ensureInitialized(s *Writer) bool {
 	}
 
 	if s.params.quality == fastOnePassCompressionQuality {
-		initCommandPrefixCodes(s.cmd_depths_[:], s.cmd_bits_[:], s.cmd_code_[:], &s.cmd_code_numbits_)
+		s.cmd_depths_ = [128]byte{
+			0, 4, 4, 5, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8,
+			0, 0, 0, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7,
+			7, 7, 10, 10, 10, 10, 10, 10, 0, 4, 4, 5, 5, 5, 6, 6,
+			7, 8, 8, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+			5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
+			4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 7, 7, 7, 8, 10,
+			12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+		}
+		s.cmd_bits_ = [128]uint16{
+			0, 0, 8, 9, 3, 35, 7, 71,
+			39, 103, 23, 47, 175, 111, 239, 31,
+			0, 0, 0, 4, 12, 2, 10, 6,
+			13, 29, 11, 43, 27, 59, 87, 55,
+			15, 79, 319, 831, 191, 703, 447, 959,
+			0, 14, 1, 25, 5, 21, 19, 51,
+			119, 159, 95, 223, 479, 991, 63, 575,
+			127, 639, 383, 895, 255, 767, 511, 1023,
+			14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			27, 59, 7, 39, 23, 55, 30, 1, 17, 9, 25, 5, 0, 8, 4, 12,
+			2, 10, 6, 21, 13, 29, 3, 19, 11, 15, 47, 31, 95, 63, 127, 255,
+			767, 2815, 1791, 3839, 511, 2559, 1535, 3583, 1023, 3071, 2047, 4095,
+		}
+		s.cmd_code_ = [512]byte{
+			0xff, 0x77, 0xd5, 0xbf, 0xe7, 0xde, 0xea, 0x9e, 0x51, 0x5d, 0xde, 0xc6,
+			0x70, 0x57, 0xbc, 0x58, 0x58, 0x58, 0xd8, 0xd8, 0x58, 0xd5, 0xcb, 0x8c,
+			0xea, 0xe0, 0xc3, 0x87, 0x1f, 0x83, 0xc1, 0x60, 0x1c, 0x67, 0xb2, 0xaa,
+			0x06, 0x83, 0xc1, 0x60, 0x30, 0x18, 0xcc, 0xa1, 0xce, 0x88, 0x54, 0x94,
+			0x46, 0xe1, 0xb0, 0xd0, 0x4e, 0xb2, 0xf7, 0x04, 0x00,
+		}
+		s.cmd_code_numbits_ = 448
 	}
 
 	s.is_initialized_ = true
@@ -1081,33 +610,23 @@ func encoderInitParams(params *encoderParams) {
 func encoderInitState(s *Writer) {
 	encoderInitParams(&s.params)
 	s.input_pos_ = 0
-	s.num_commands_ = 0
+	s.commands = s.commands[:0]
 	s.num_literals_ = 0
 	s.last_insert_len_ = 0
 	s.last_flush_pos_ = 0
 	s.last_processed_pos_ = 0
 	s.prev_byte_ = 0
 	s.prev_byte2_ = 0
-	s.storage_size_ = 0
-	s.storage_ = nil
-	s.hasher_ = nil
-	s.large_table_ = nil
-	s.large_table_size_ = 0
+	if s.hasher_ != nil {
+		s.hasher_.Common().is_prepared_ = false
+	}
 	s.cmd_code_numbits_ = 0
-	s.command_buf_ = nil
-	s.literal_buf_ = nil
-	s.next_out_ = nil
-	s.available_out_ = 0
-	s.total_out_ = 0
 	s.stream_state_ = streamProcessing
 	s.is_last_block_emitted_ = false
 	s.is_initialized_ = false
 
 	ringBufferInit(&s.ringbuffer_)
 
-	s.commands_ = nil
-	s.cmd_alloc_size_ = 0
-
 	/* Initialize distance cache. */
 	s.dist_cache_[0] = 4
 
@@ -1190,7 +709,7 @@ func updateLastProcessedPos(s *Writer) bool {
 }
 
 func extendLastCommand(s *Writer, bytes *uint32, wrapped_last_processed_pos *uint32) {
-	var last_command *command = &s.commands_[s.num_commands_-1]
+	var last_command *command = &s.commands[len(s.commands)-1]
 	var data []byte = s.ringbuffer_.buffer_
 	var mask uint32 = s.ringbuffer_.mask_
 	var max_backward_distance uint64 = ((uint64(1)) << s.params.lgwin) - windowGap
@@ -1219,18 +738,17 @@ func extendLastCommand(s *Writer, bytes *uint32, wrapped_last_processed_pos *uin
 }
 
 /*
-   Processes the accumulated input data and sets |*out_size| to the length of
-   the new output meta-block, or to zero if no new output meta-block has been
-   created (in this case the processed input data is buffered internally).
-   If |*out_size| is positive, |*output| points to the start of the output
-   data. If |is_last| or |force_flush| is true, an output meta-block is
+   Processes the accumulated input data and writes
+   the new output meta-block to s.dest, if one has been
+   created (otherwise the processed input data is buffered internally).
+   If |is_last| or |force_flush| is true, an output meta-block is
    always created. However, until |is_last| is true encoder may retain up
    to 7 bits of the last byte of output. To force encoder to dump the remaining
    bits use WriteMetadata() to append an empty meta-data block.
    Returns false if the size of the input data is larger than
    input_block_size().
 */
-func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, output *[]byte) bool {
+func encodeData(s *Writer, is_last bool, force_flush bool) bool {
 	var delta uint64 = unprocessedInputSize(s)
 	var bytes uint32 = uint32(delta)
 	var wrapped_last_processed_pos uint32 = wrapPosition(s.last_processed_pos_)
@@ -1253,9 +771,14 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 		return false
 	}
 
-	if s.params.quality == fastTwoPassCompressionQuality && s.command_buf_ == nil {
-		s.command_buf_ = make([]uint32, kCompressFragmentTwoPassBlockSize)
-		s.literal_buf_ = make([]byte, kCompressFragmentTwoPassBlockSize)
+	if s.params.quality == fastTwoPassCompressionQuality {
+		if s.command_buf_ == nil || cap(s.command_buf_) < int(kCompressFragmentTwoPassBlockSize) {
+			s.command_buf_ = make([]uint32, kCompressFragmentTwoPassBlockSize)
+			s.literal_buf_ = make([]byte, kCompressFragmentTwoPassBlockSize)
+		} else {
+			s.command_buf_ = s.command_buf_[:kCompressFragmentTwoPassBlockSize]
+			s.literal_buf_ = s.literal_buf_[:kCompressFragmentTwoPassBlockSize]
+		}
 	}
 
 	if s.params.quality == fastOnePassCompressionQuality || s.params.quality == fastTwoPassCompressionQuality {
@@ -1267,12 +790,10 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 		if delta == 0 && !is_last {
 			/* We have no new input data and we don't have to finish the stream, so
 			   nothing to do. */
-			*out_size = 0
-
 			return true
 		}
 
-		storage = getBrotliStorage(s, uint(2*bytes+503))
+		storage = s.getStorage(int(2*bytes + 503))
 		storage[0] = byte(s.last_bytes_)
 		storage[1] = byte(s.last_bytes_ >> 8)
 		table = getHashTable(s, s.params.quality, uint(bytes), &table_size)
@@ -1285,28 +806,23 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 		s.last_bytes_ = uint16(storage[storage_ix>>3])
 		s.last_bytes_bits_ = byte(storage_ix & 7)
 		updateLastProcessedPos(s)
-		*output = storage[0:]
-		*out_size = storage_ix >> 3
+		s.writeOutput(storage[:storage_ix>>3])
 		return true
 	}
 	{
 		/* Theoretical max number of commands is 1 per 2 bytes. */
-		var newsize uint = uint(uint32(s.num_commands_) + bytes/2 + 1)
-		if newsize > s.cmd_alloc_size_ {
-			var new_commands []command
-
+		newsize := len(s.commands) + int(bytes)/2 + 1
+		if newsize > cap(s.commands) {
 			/* Reserve a bit more memory to allow merging with a next block
 			   without reallocation: that would impact speed. */
-			newsize += uint((bytes / 4) + 16)
+			newsize += int(bytes/4) + 16
 
-			s.cmd_alloc_size_ = newsize
-			new_commands = make([]command, newsize)
-			if s.commands_ != nil {
-				copy(new_commands, s.commands_[:s.num_commands_])
-				s.commands_ = nil
+			new_commands := make([]command, len(s.commands), newsize)
+			if s.commands != nil {
+				copy(new_commands, s.commands)
 			}
 
-			s.commands_ = new_commands
+			s.commands = new_commands
 		}
 	}
 
@@ -1314,46 +830,44 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 
 	literal_context_mode = chooseContextMode(&s.params, data, uint(wrapPosition(s.last_flush_pos_)), uint(mask), uint(s.input_pos_-s.last_flush_pos_))
 
-	if s.num_commands_ != 0 && s.last_insert_len_ == 0 {
+	if len(s.commands) != 0 && s.last_insert_len_ == 0 {
 		extendLastCommand(s, &bytes, &wrapped_last_processed_pos)
 	}
 
 	if s.params.quality == zopflificationQuality {
 		assert(s.params.hasher.type_ == 10)
-		createZopfliBackwardReferences(uint(bytes), uint(wrapped_last_processed_pos), data, uint(mask), &s.params, s.hasher_.(*h10), s.dist_cache_[:], &s.last_insert_len_, s.commands_[s.num_commands_:], &s.num_commands_, &s.num_literals_)
+		createZopfliBackwardReferences(uint(bytes), uint(wrapped_last_processed_pos), data, uint(mask), &s.params, s.hasher_.(*h10), s.dist_cache_[:], &s.last_insert_len_, &s.commands, &s.num_literals_)
 	} else if s.params.quality == hqZopflificationQuality {
 		assert(s.params.hasher.type_ == 10)
-		createHqZopfliBackwardReferences(uint(bytes), uint(wrapped_last_processed_pos), data, uint(mask), &s.params, s.hasher_, s.dist_cache_[:], &s.last_insert_len_, s.commands_[s.num_commands_:], &s.num_commands_, &s.num_literals_)
+		createHqZopfliBackwardReferences(uint(bytes), uint(wrapped_last_processed_pos), data, uint(mask), &s.params, s.hasher_, s.dist_cache_[:], &s.last_insert_len_, &s.commands, &s.num_literals_)
 	} else {
-		createBackwardReferences(uint(bytes), uint(wrapped_last_processed_pos), data, uint(mask), &s.params, s.hasher_, s.dist_cache_[:], &s.last_insert_len_, s.commands_[s.num_commands_:], &s.num_commands_, &s.num_literals_)
+		createBackwardReferences(uint(bytes), uint(wrapped_last_processed_pos), data, uint(mask), &s.params, s.hasher_, s.dist_cache_[:], &s.last_insert_len_, &s.commands, &s.num_literals_)
 	}
 	{
 		var max_length uint = maxMetablockSize(&s.params)
 		var max_literals uint = max_length / 8
-		var max_commands uint = max_length / 8
+		max_commands := int(max_length / 8)
 		var processed_bytes uint = uint(s.input_pos_ - s.last_flush_pos_)
 		var next_input_fits_metablock bool = (processed_bytes+inputBlockSize(s) <= max_length)
-		var should_flush bool = (s.params.quality < minQualityForBlockSplit && s.num_literals_+s.num_commands_ >= maxNumDelayedSymbols)
+		var should_flush bool = (s.params.quality < minQualityForBlockSplit && s.num_literals_+uint(len(s.commands)) >= maxNumDelayedSymbols)
 		/* If maximal possible additional block doesn't fit metablock, flush now. */
 		/* TODO: Postpone decision until next block arrives? */
 
 		/* If block splitting is not used, then flush as soon as there is some
 		   amount of commands / literals produced. */
-		if !is_last && !force_flush && !should_flush && next_input_fits_metablock && s.num_literals_ < max_literals && s.num_commands_ < max_commands {
+		if !is_last && !force_flush && !should_flush && next_input_fits_metablock && s.num_literals_ < max_literals && len(s.commands) < max_commands {
 			/* Merge with next input block. Everything will happen later. */
 			if updateLastProcessedPos(s) {
 				hasherReset(s.hasher_)
 			}
 
-			*out_size = 0
 			return true
 		}
 	}
 
 	/* Create the last insert-only command. */
 	if s.last_insert_len_ > 0 {
-		initInsertCommand(&s.commands_[s.num_commands_], s.last_insert_len_)
-		s.num_commands_++
+		s.commands = append(s.commands, makeInsertCommand(s.last_insert_len_))
 		s.num_literals_ += s.last_insert_len_
 		s.last_insert_len_ = 0
 	}
@@ -1361,8 +875,6 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 	if !is_last && s.input_pos_ == s.last_flush_pos_ {
 		/* We have no new input data and we don't have to finish the stream, so
 		   nothing to do. */
-		*out_size = 0
-
 		return true
 	}
 
@@ -1371,11 +883,11 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 	assert(s.input_pos_-s.last_flush_pos_ <= 1<<24)
 	{
 		var metablock_size uint32 = uint32(s.input_pos_ - s.last_flush_pos_)
-		var storage []byte = getBrotliStorage(s, uint(2*metablock_size+503))
+		var storage []byte = s.getStorage(int(2*metablock_size + 503))
 		var storage_ix uint = uint(s.last_bytes_bits_)
 		storage[0] = byte(s.last_bytes_)
 		storage[1] = byte(s.last_bytes_ >> 8)
-		writeMetaBlockInternal(data, uint(mask), s.last_flush_pos_, uint(metablock_size), is_last, literal_context_mode, &s.params, s.prev_byte_, s.prev_byte2_, s.num_literals_, s.num_commands_, s.commands_, s.saved_dist_cache_[:], s.dist_cache_[:], &storage_ix, storage)
+		writeMetaBlockInternal(data, uint(mask), s.last_flush_pos_, uint(metablock_size), is_last, literal_context_mode, &s.params, s.prev_byte_, s.prev_byte2_, s.num_literals_, s.commands, s.saved_dist_cache_[:], s.dist_cache_[:], &storage_ix, storage)
 		s.last_bytes_ = uint16(storage[storage_ix>>3])
 		s.last_bytes_bits_ = byte(storage_ix & 7)
 		s.last_flush_pos_ = s.input_pos_
@@ -1391,15 +903,14 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
 			s.prev_byte2_ = data[uint32(s.last_flush_pos_-2)&mask]
 		}
 
-		s.num_commands_ = 0
+		s.commands = s.commands[:0]
 		s.num_literals_ = 0
 
 		/* Save the state of the distance cache in case we need to restore it for
 		   emitting an uncompressed block. */
 		copy(s.saved_dist_cache_[:], s.dist_cache_[:])
 
-		*output = storage[0:]
-		*out_size = storage_ix >> 3
+		s.writeOutput(storage[:storage_ix>>3])
 		return true
 	}
 }
@@ -1409,8 +920,7 @@ func encodeData(s *Writer, is_last bool, force_flush bool, out_size *uint, outpu
    REQUIRED: |header| should be 8-byte aligned and at least 16 bytes long.
    REQUIRED: |block_size| <= (1 << 24). */
 func writeMetadataHeader(s *Writer, block_size uint, header []byte) uint {
-	var storage_ix uint
-	storage_ix = uint(s.last_bytes_bits_)
+	storage_ix := uint(s.last_bytes_bits_)
 	header[0] = byte(s.last_bytes_)
 	header[1] = byte(s.last_bytes_ >> 8)
 	s.last_bytes_ = 0
@@ -1439,7 +949,6 @@ func writeMetadataHeader(s *Writer, block_size uint, header []byte) uint {
 func injectBytePaddingBlock(s *Writer) {
 	var seal uint32 = uint32(s.last_bytes_)
 	var seal_bits uint = uint(s.last_bytes_bits_)
-	var destination []byte
 	s.last_bytes_ = 0
 	s.last_bytes_bits_ = 0
 
@@ -1448,14 +957,7 @@ func injectBytePaddingBlock(s *Writer) {
 
 	seal_bits += 6
 
-	/* If we have already created storage, then append to it.
-	   Storage is valid until next block is being compressed. */
-	if s.next_out_ != nil {
-		destination = s.next_out_[s.available_out_:]
-	} else {
-		destination = s.tiny_buf_.u8[:]
-		s.next_out_ = destination
-	}
+	destination := s.tiny_buf_.u8[:]
 
 	destination[0] = byte(seal)
 	if seal_bits > 8 {
@@ -1464,42 +966,35 @@ func injectBytePaddingBlock(s *Writer) {
 	if seal_bits > 16 {
 		destination[2] = byte(seal >> 16)
 	}
-	s.available_out_ += (seal_bits + 7) >> 3
+	s.writeOutput(destination[:(seal_bits+7)>>3])
 }
 
 func checkFlushComplete(s *Writer) {
-	if s.stream_state_ == streamFlushRequested && s.available_out_ == 0 {
+	if s.stream_state_ == streamFlushRequested && s.err == nil {
 		s.stream_state_ = streamProcessing
-		s.next_out_ = nil
 	}
 }
 
 func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[]byte) bool {
 	var block_size_limit uint = uint(1) << s.params.lgwin
 	var buf_size uint = brotli_min_size_t(kCompressFragmentTwoPassBlockSize, brotli_min_size_t(*available_in, block_size_limit))
-	var tmp_command_buf []uint32 = nil
 	var command_buf []uint32 = nil
-	var tmp_literal_buf []byte = nil
 	var literal_buf []byte = nil
 	if s.params.quality != fastOnePassCompressionQuality && s.params.quality != fastTwoPassCompressionQuality {
 		return false
 	}
 
 	if s.params.quality == fastTwoPassCompressionQuality {
-		if s.command_buf_ == nil && buf_size == kCompressFragmentTwoPassBlockSize {
-			s.command_buf_ = make([]uint32, kCompressFragmentTwoPassBlockSize)
-			s.literal_buf_ = make([]byte, kCompressFragmentTwoPassBlockSize)
-		}
-
-		if s.command_buf_ != nil {
-			command_buf = s.command_buf_
-			literal_buf = s.literal_buf_
+		if s.command_buf_ == nil || cap(s.command_buf_) < int(buf_size) {
+			s.command_buf_ = make([]uint32, buf_size)
+			s.literal_buf_ = make([]byte, buf_size)
 		} else {
-			tmp_command_buf = make([]uint32, buf_size)
-			tmp_literal_buf = make([]byte, buf_size)
-			command_buf = tmp_command_buf
-			literal_buf = tmp_literal_buf
+			s.command_buf_ = s.command_buf_[:buf_size]
+			s.literal_buf_ = s.literal_buf_[:buf_size]
 		}
+
+		command_buf = s.command_buf_
+		literal_buf = s.literal_buf_
 	}
 
 	for {
@@ -1508,10 +1003,10 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 			continue
 		}
 
-		/* Compress block only when internal output buffer is empty, stream is not
+		/* Compress block only when stream is not
 		   finished, there is no pending flush request, and there is either
 		   additional input or pending operation. */
-		if s.available_out_ == 0 && s.stream_state_ == streamProcessing && (*available_in != 0 || op != int(operationProcess)) {
+		if s.stream_state_ == streamProcessing && (*available_in != 0 || op != int(operationProcess)) {
 			var block_size uint = brotli_min_size_t(block_size_limit, *available_in)
 			var is_last bool = (*available_in == block_size) && (op == int(operationFinish))
 			var force_flush bool = (*available_in == block_size) && (op == int(operationFlush))
@@ -1526,7 +1021,7 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 				continue
 			}
 
-			storage = getBrotliStorage(s, max_out_size)
+			storage = s.getStorage(int(max_out_size))
 
 			storage[0] = byte(s.last_bytes_)
 			storage[1] = byte(s.last_bytes_ >> 8)
@@ -1541,8 +1036,7 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 			*next_in = (*next_in)[block_size:]
 			*available_in -= block_size
 			var out_bytes uint = storage_ix >> 3
-			s.next_out_ = storage
-			s.available_out_ = out_bytes
+			s.writeOutput(storage[:out_bytes])
 
 			s.last_bytes_ = uint16(storage[storage_ix>>3])
 			s.last_bytes_bits_ = byte(storage_ix & 7)
@@ -1559,8 +1053,6 @@ func encoderCompressStreamFast(s *Writer, op int, available_in *uint, next_in *[
 		break
 	}
 
-	tmp_command_buf = nil
-	tmp_literal_buf = nil
 	checkFlushComplete(s)
 	return true
 }
@@ -1586,12 +1078,8 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool {
 			continue
 		}
 
-		if s.available_out_ != 0 {
-			break
-		}
-
 		if s.input_pos_ != s.last_flush_pos_ {
-			var result bool = encodeData(s, false, true, &s.available_out_, &s.next_out_)
+			var result bool = encodeData(s, false, true)
 			if !result {
 				return false
 			}
@@ -1599,8 +1087,8 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool {
 		}
 
 		if s.stream_state_ == streamMetadataHead {
-			s.next_out_ = s.tiny_buf_.u8[:]
-			s.available_out_ = writeMetadataHeader(s, uint(s.remaining_metadata_bytes_), s.next_out_)
+			n := writeMetadataHeader(s, uint(s.remaining_metadata_bytes_), s.tiny_buf_.u8[:])
+			s.writeOutput(s.tiny_buf_.u8[:n])
 			s.stream_state_ = streamMetadataBody
 			continue
 		} else {
@@ -1614,12 +1102,11 @@ func processMetadata(s *Writer, available_in *uint, next_in *[]byte) bool {
 
 			/* This guarantees progress in "TakeOutput" workflow. */
 			var c uint32 = brotli_min_uint32_t(s.remaining_metadata_bytes_, 16)
-			s.next_out_ = s.tiny_buf_.u8[:]
-			copy(s.next_out_, (*next_in)[:c])
+			copy(s.tiny_buf_.u8[:], (*next_in)[:c])
 			*next_in = (*next_in)[c:]
 			*available_in -= uint(c)
 			s.remaining_metadata_bytes_ -= c
-			s.available_out_ = uint(c)
+			s.writeOutput(s.tiny_buf_.u8[:c])
 
 			continue
 		}
@@ -1692,15 +1179,15 @@ func encoderCompressStream(s *Writer, op int, available_in *uint, next_in *[]byt
 			continue
 		}
 
-		/* Compress data only when internal output buffer is empty, stream is not
+		/* Compress data only when stream is not
 		   finished and there is no pending flush request. */
-		if s.available_out_ == 0 && s.stream_state_ == streamProcessing {
+		if s.stream_state_ == streamProcessing {
 			if remaining_block_size == 0 || op != int(operationProcess) {
 				var is_last bool = ((*available_in == 0) && op == int(operationFinish))
 				var force_flush bool = ((*available_in == 0) && op == int(operationFlush))
 				var result bool
 				updateSizeHint(s, *available_in)
-				result = encodeData(s, is_last, force_flush, &s.available_out_, &s.next_out_)
+				result = encodeData(s, is_last, force_flush)
 				if !result {
 					return false
 				}
@@ -1721,18 +1208,13 @@ func encoderCompressStream(s *Writer, op int, available_in *uint, next_in *[]byt
 	return true
 }
 
-func encoderHasMoreOutput(s *Writer) bool {
-	return s.available_out_ != 0
-}
-
-func encoderTakeOutput(s *Writer) []byte {
-	if s.available_out_ == 0 {
-		return nil
+func (w *Writer) writeOutput(data []byte) {
+	if w.err != nil {
+		return
 	}
-	result := s.next_out_[:s.available_out_]
-	s.total_out_ += s.available_out_
-	s.available_out_ = 0
-	checkFlushComplete(s)
 
-	return result
+	_, w.err = w.dst.Write(data)
+	if w.err == nil {
+		checkFlushComplete(w)
+	}
 }
diff --git a/vendor/github.com/andybalholm/brotli/entropy_encode.go b/vendor/github.com/andybalholm/brotli/entropy_encode.go
index d0c1dca..3f469a3 100644
--- a/vendor/github.com/andybalholm/brotli/entropy_encode.go
+++ b/vendor/github.com/andybalholm/brotli/entropy_encode.go
@@ -24,7 +24,7 @@ func initHuffmanTree(self *huffmanTree, count uint32, left int16, right int16) {
 }
 
 /* Input size optimized Shell sort. */
-type huffmanTreeComparator func(*huffmanTree, *huffmanTree) bool
+type huffmanTreeComparator func(huffmanTree, huffmanTree) bool
 
 var sortHuffmanTreeItems_gaps = []uint{132, 57, 23, 10, 4, 1}
 
@@ -36,14 +36,13 @@ func sortHuffmanTreeItems(items []huffmanTree, n uint, comparator huffmanTreeCom
 			var tmp huffmanTree = items[i]
 			var k uint = i
 			var j uint = i - 1
-			for comparator(&tmp, &items[j]) {
+			for comparator(tmp, items[j]) {
 				items[k] = items[j]
 				k = j
-				tmp10 := j
-				j--
-				if tmp10 == 0 {
+				if j == 0 {
 					break
 				}
+				j--
 			}
 
 			items[k] = tmp
@@ -63,7 +62,7 @@ func sortHuffmanTreeItems(items []huffmanTree, n uint, comparator huffmanTreeCom
 			for i = gap; i < n; i++ {
 				var j uint = i
 				var tmp huffmanTree = items[i]
-				for ; j >= gap && comparator(&tmp, &items[j-gap]); j -= gap {
+				for ; j >= gap && comparator(tmp, items[j-gap]); j -= gap {
 					items[j] = items[j-gap]
 				}
 
@@ -105,7 +104,7 @@ func setDepth(p0 int, pool []huffmanTree, depth []byte, max_depth int) bool {
 }
 
 /* Sort the root nodes, least popular first. */
-func sortHuffmanTree(v0 *huffmanTree, v1 *huffmanTree) bool {
+func sortHuffmanTree(v0 huffmanTree, v1 huffmanTree) bool {
 	if v0.total_count_ != v1.total_count_ {
 		return v0.total_count_ < v1.total_count_
 	}
diff --git a/vendor/github.com/andybalholm/brotli/fast_log.go b/vendor/github.com/andybalholm/brotli/fast_log.go
index bbae300..9d6607f 100644
--- a/vendor/github.com/andybalholm/brotli/fast_log.go
+++ b/vendor/github.com/andybalholm/brotli/fast_log.go
@@ -1,6 +1,9 @@
 package brotli
 
-import "math"
+import (
+	"math"
+	"math/bits"
+)
 
 /* Copyright 2013 Google Inc. All Rights Reserved.
 
@@ -11,16 +14,7 @@ import "math"
 /* Utilities for fast computation of logarithms. */
 
 func log2FloorNonZero(n uint) uint32 {
-	/* TODO: generalize and move to platform.h */
-	var result uint32 = 0
-	for {
-		n >>= 1
-		if n == 0 {
-			break
-		}
-		result++
-	}
-	return result
+	return uint32(bits.Len(n)) - 1
 }
 
 /* A lookup table for small values of log2(int) to be used in entropy
diff --git a/vendor/github.com/andybalholm/brotli/find_match_length.go b/vendor/github.com/andybalholm/brotli/find_match_length.go
index 14d350a..09d2ae6 100644
--- a/vendor/github.com/andybalholm/brotli/find_match_length.go
+++ b/vendor/github.com/andybalholm/brotli/find_match_length.go
@@ -1,5 +1,11 @@
 package brotli
 
+import (
+	"encoding/binary"
+	"math/bits"
+	"runtime"
+)
+
 /* Copyright 2010 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -9,6 +15,29 @@ package brotli
 /* Function to find maximal matching prefixes of strings. */
 func findMatchLengthWithLimit(s1 []byte, s2 []byte, limit uint) uint {
 	var matched uint = 0
+	_, _ = s1[limit-1], s2[limit-1] // bounds check
+	switch runtime.GOARCH {
+	case "amd64":
+		// Compare 8 bytes at at time.
+		for matched+8 <= limit {
+			w1 := binary.LittleEndian.Uint64(s1[matched:])
+			w2 := binary.LittleEndian.Uint64(s2[matched:])
+			if w1 != w2 {
+				return matched + uint(bits.TrailingZeros64(w1^w2)>>3)
+			}
+			matched += 8
+		}
+	case "386":
+		// Compare 4 bytes at at time.
+		for matched+4 <= limit {
+			w1 := binary.LittleEndian.Uint32(s1[matched:])
+			w2 := binary.LittleEndian.Uint32(s2[matched:])
+			if w1 != w2 {
+				return matched + uint(bits.TrailingZeros32(w1^w2)>>3)
+			}
+			matched += 4
+		}
+	}
 	for matched < limit && s1[matched] == s2[matched] {
 		matched++
 	}
diff --git a/vendor/github.com/andybalholm/brotli/hash.go b/vendor/github.com/andybalholm/brotli/hash.go
index 003b433..00f812e 100644
--- a/vendor/github.com/andybalholm/brotli/hash.go
+++ b/vendor/github.com/andybalholm/brotli/hash.go
@@ -29,8 +29,6 @@ type hasherHandle interface {
 	Store(data []byte, mask uint, ix uint)
 }
 
-type score_t uint
-
 const kCutoffTransformsCount uint32 = 10
 
 /*   0,  12,   27,    23,    42,    63,    56,    48,    59,    64 */
diff --git a/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go b/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go
index 3364c44..306e46d 100644
--- a/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go
+++ b/vendor/github.com/andybalholm/brotli/hash_forgetful_chain.go
@@ -110,8 +110,7 @@ func (h *hashForgetfulChain) Prepare(one_shot bool, input_size uint, data []byte
 func (h *hashForgetfulChain) Store(data []byte, mask uint, ix uint) {
 	var key uint = h.HashBytes(data[ix&mask:])
 	var bank uint = key & (h.numBanks - 1)
-	var idx uint
-	idx = uint(h.free_slot_idx[bank]) & ((1 << h.bankBits) - 1)
+	idx := uint(h.free_slot_idx[bank]) & ((1 << h.bankBits) - 1)
 	h.free_slot_idx[bank]++
 	var delta uint = ix - uint(h.addr[key])
 	h.tiny_hash[uint16(ix)] = byte(key)
diff --git a/vendor/github.com/andybalholm/brotli/hash_rolling.go b/vendor/github.com/andybalholm/brotli/hash_rolling.go
index ad655a0..6630fc0 100644
--- a/vendor/github.com/andybalholm/brotli/hash_rolling.go
+++ b/vendor/github.com/andybalholm/brotli/hash_rolling.go
@@ -48,7 +48,6 @@ type hashRolling struct {
 	state         uint32
 	table         []uint32
 	next_ix       uint
-	chunk_len     uint32
 	factor        uint32
 	factor_remove uint32
 }
diff --git a/vendor/github.com/andybalholm/brotli/histogram.go b/vendor/github.com/andybalholm/brotli/histogram.go
index f208ff7..0346622 100644
--- a/vendor/github.com/andybalholm/brotli/histogram.go
+++ b/vendor/github.com/andybalholm/brotli/histogram.go
@@ -163,7 +163,7 @@ func initBlockSplitIterator(self *blockSplitIterator, split *blockSplit) {
 	self.split_ = split
 	self.idx_ = 0
 	self.type_ = 0
-	if split.lengths != nil {
+	if len(split.lengths) > 0 {
 		self.length_ = uint(split.lengths[0])
 	} else {
 		self.length_ = 0
@@ -180,17 +180,16 @@ func blockSplitIteratorNext(self *blockSplitIterator) {
 	self.length_--
 }
 
-func buildHistogramsWithContext(cmds []command, num_commands uint, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit, ringbuffer []byte, start_pos uint, mask uint, prev_byte byte, prev_byte2 byte, context_modes []int, literal_histograms []histogramLiteral, insert_and_copy_histograms []histogramCommand, copy_dist_histograms []histogramDistance) {
+func buildHistogramsWithContext(cmds []command, literal_split *blockSplit, insert_and_copy_split *blockSplit, dist_split *blockSplit, ringbuffer []byte, start_pos uint, mask uint, prev_byte byte, prev_byte2 byte, context_modes []int, literal_histograms []histogramLiteral, insert_and_copy_histograms []histogramCommand, copy_dist_histograms []histogramDistance) {
 	var pos uint = start_pos
 	var literal_it blockSplitIterator
 	var insert_and_copy_it blockSplitIterator
 	var dist_it blockSplitIterator
-	var i uint
 
 	initBlockSplitIterator(&literal_it, literal_split)
 	initBlockSplitIterator(&insert_and_copy_it, insert_and_copy_split)
 	initBlockSplitIterator(&dist_it, dist_split)
-	for i = 0; i < num_commands; i++ {
+	for i := range cmds {
 		var cmd *command = &cmds[i]
 		var j uint
 		blockSplitIteratorNext(&insert_and_copy_it)
diff --git a/vendor/github.com/andybalholm/brotli/http.go b/vendor/github.com/andybalholm/brotli/http.go
index af58670..1e98196 100644
--- a/vendor/github.com/andybalholm/brotli/http.go
+++ b/vendor/github.com/andybalholm/brotli/http.go
@@ -180,8 +180,8 @@ func init() {
 		var t octetType
 		isCtl := c <= 31 || c == 127
 		isChar := 0 <= c && c <= 127
-		isSeparator := strings.IndexRune(" \t\"(),/:;<=>?@[]\\{}", rune(c)) >= 0
-		if strings.IndexRune(" \t\r\n", rune(c)) >= 0 {
+		isSeparator := strings.ContainsRune(" \t\"(),/:;<=>?@[]\\{}", rune(c))
+		if strings.ContainsRune(" \t\r\n", rune(c)) {
 			t |= isSpace
 		}
 		if isChar && !isCtl && !isSeparator {
diff --git a/vendor/github.com/andybalholm/brotli/memory.go b/vendor/github.com/andybalholm/brotli/memory.go
index 7208a3b..a07c705 100644
--- a/vendor/github.com/andybalholm/brotli/memory.go
+++ b/vendor/github.com/andybalholm/brotli/memory.go
@@ -23,12 +23,18 @@ func brotli_ensure_capacity_uint8_t(a *[]byte, c *uint, r uint) {
 		for new_size < r {
 			new_size *= 2
 		}
-		var new_array []byte = make([]byte, new_size)
-		if *c != 0 {
-			copy(new_array, (*a)[:*c])
+
+		if cap(*a) < int(new_size) {
+			var new_array []byte = make([]byte, new_size)
+			if *c != 0 {
+				copy(new_array, (*a)[:*c])
+			}
+
+			*a = new_array
+		} else {
+			*a = (*a)[:new_size]
 		}
 
-		*a = new_array
 		*c = new_size
 	}
 }
@@ -45,12 +51,16 @@ func brotli_ensure_capacity_uint32_t(a *[]uint32, c *uint, r uint) {
 			new_size *= 2
 		}
 
-		new_array = make([]uint32, new_size)
-		if *c != 0 {
-			copy(new_array, (*a)[:*c])
-		}
+		if cap(*a) < int(new_size) {
+			new_array = make([]uint32, new_size)
+			if *c != 0 {
+				copy(new_array, (*a)[:*c])
+			}
 
-		*a = new_array
+			*a = new_array
+		} else {
+			*a = (*a)[:new_size]
+		}
 		*c = new_size
 	}
 }
diff --git a/vendor/github.com/andybalholm/brotli/metablock.go b/vendor/github.com/andybalholm/brotli/metablock.go
index 4a412cf..3014df8 100644
--- a/vendor/github.com/andybalholm/brotli/metablock.go
+++ b/vendor/github.com/andybalholm/brotli/metablock.go
@@ -1,5 +1,9 @@
 package brotli
 
+import (
+	"sync"
+)
+
 /* Copyright 2014 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -25,31 +29,30 @@ type metaBlockSplit struct {
 	distance_histograms_size  uint
 }
 
-func initMetaBlockSplit(mb *metaBlockSplit) {
-	initBlockSplit(&mb.literal_split)
-	initBlockSplit(&mb.command_split)
-	initBlockSplit(&mb.distance_split)
-	mb.literal_context_map = nil
-	mb.literal_context_map_size = 0
-	mb.distance_context_map = nil
-	mb.distance_context_map_size = 0
-	mb.literal_histograms = nil
-	mb.literal_histograms_size = 0
-	mb.command_histograms = nil
-	mb.command_histograms_size = 0
-	mb.distance_histograms = nil
-	mb.distance_histograms_size = 0
+var metaBlockPool sync.Pool
+
+func getMetaBlockSplit() *metaBlockSplit {
+	mb, _ := metaBlockPool.Get().(*metaBlockSplit)
+
+	if mb == nil {
+		mb = &metaBlockSplit{}
+	} else {
+		initBlockSplit(&mb.literal_split)
+		initBlockSplit(&mb.command_split)
+		initBlockSplit(&mb.distance_split)
+		mb.literal_context_map = mb.literal_context_map[:0]
+		mb.literal_context_map_size = 0
+		mb.distance_context_map = mb.distance_context_map[:0]
+		mb.distance_context_map_size = 0
+		mb.literal_histograms = mb.literal_histograms[:0]
+		mb.command_histograms = mb.command_histograms[:0]
+		mb.distance_histograms = mb.distance_histograms[:0]
+	}
+	return mb
 }
 
-func destroyMetaBlockSplit(mb *metaBlockSplit) {
-	destroyBlockSplit(&mb.literal_split)
-	destroyBlockSplit(&mb.command_split)
-	destroyBlockSplit(&mb.distance_split)
-	mb.literal_context_map = nil
-	mb.distance_context_map = nil
-	mb.literal_histograms = nil
-	mb.command_histograms = nil
-	mb.distance_histograms = nil
+func freeMetaBlockSplit(mb *metaBlockSplit) {
+	metaBlockPool.Put(mb)
 }
 
 func initDistanceParams(params *encoderParams, npostfix uint32, ndirect uint32) {
@@ -84,14 +87,12 @@ func initDistanceParams(params *encoderParams, npostfix uint32, ndirect uint32)
 	dist_params.max_distance = uint(max_distance)
 }
 
-func recomputeDistancePrefixes(cmds []command, num_commands uint, orig_params *distanceParams, new_params *distanceParams) {
-	var i uint
-
+func recomputeDistancePrefixes(cmds []command, orig_params *distanceParams, new_params *distanceParams) {
 	if orig_params.distance_postfix_bits == new_params.distance_postfix_bits && orig_params.num_direct_distance_codes == new_params.num_direct_distance_codes {
 		return
 	}
 
-	for i = 0; i < num_commands; i++ {
+	for i := range cmds {
 		var cmd *command = &cmds[i]
 		if commandCopyLen(cmd) != 0 && cmd.cmd_prefix_ >= 128 {
 			prefixEncodeCopyDistance(uint(commandRestoreDistanceCode(cmd, orig_params)), uint(new_params.num_direct_distance_codes), uint(new_params.distance_postfix_bits), &cmd.dist_prefix_, &cmd.dist_extra_)
@@ -99,8 +100,7 @@ func recomputeDistancePrefixes(cmds []command, num_commands uint, orig_params *d
 	}
 }
 
-func computeDistanceCost(cmds []command, num_commands uint, orig_params *distanceParams, new_params *distanceParams, cost *float64) bool {
-	var i uint
+func computeDistanceCost(cmds []command, orig_params *distanceParams, new_params *distanceParams, cost *float64) bool {
 	var equal_params bool = false
 	var dist_prefix uint16
 	var dist_extra uint32
@@ -112,8 +112,8 @@ func computeDistanceCost(cmds []command, num_commands uint, orig_params *distanc
 		equal_params = true
 	}
 
-	for i = 0; i < num_commands; i++ {
-		var cmd *command = &cmds[i]
+	for i := range cmds {
+		cmd := &cmds[i]
 		if commandCopyLen(cmd) != 0 && cmd.cmd_prefix_ >= 128 {
 			if equal_params {
 				dist_prefix = cmd.dist_prefix_
@@ -137,7 +137,7 @@ func computeDistanceCost(cmds []command, num_commands uint, orig_params *distanc
 
 var buildMetaBlock_kMaxNumberOfHistograms uint = 256
 
-func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParams, prev_byte byte, prev_byte2 byte, cmds []command, num_commands uint, literal_context_mode int, mb *metaBlockSplit) {
+func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParams, prev_byte byte, prev_byte2 byte, cmds []command, literal_context_mode int, mb *metaBlockSplit) {
 	var distance_histograms []histogramDistance
 	var literal_histograms []histogramLiteral
 	var literal_context_modes []int = nil
@@ -164,7 +164,7 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
 				check_orig = false
 			}
 
-			skip = !computeDistanceCost(cmds, num_commands, &orig_params.dist, &new_params.dist, &dist_cost)
+			skip = !computeDistanceCost(cmds, &orig_params.dist, &new_params.dist, &dist_cost)
 			if skip || (dist_cost > best_dist_cost) {
 				break
 			}
@@ -181,7 +181,7 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
 
 	if check_orig {
 		var dist_cost float64
-		computeDistanceCost(cmds, num_commands, &orig_params.dist, &orig_params.dist, &dist_cost)
+		computeDistanceCost(cmds, &orig_params.dist, &orig_params.dist, &dist_cost)
 		if dist_cost < best_dist_cost {
 			/* NB: currently unused; uncomment when more param tuning is added. */
 			/* best_dist_cost = dist_cost; */
@@ -189,9 +189,9 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
 		}
 	}
 
-	recomputeDistancePrefixes(cmds, num_commands, &orig_params.dist, &params.dist)
+	recomputeDistancePrefixes(cmds, &orig_params.dist, &params.dist)
 
-	splitBlock(cmds, num_commands, ringbuffer, pos, mask, params, &mb.literal_split, &mb.command_split, &mb.distance_split)
+	splitBlock(cmds, ringbuffer, pos, mask, params, &mb.literal_split, &mb.command_split, &mb.distance_split)
 
 	if !params.disable_literal_context_modeling {
 		literal_context_multiplier = 1 << literalContextBits
@@ -209,21 +209,30 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
 	distance_histograms = make([]histogramDistance, distance_histograms_size)
 	clearHistogramsDistance(distance_histograms, distance_histograms_size)
 
-	assert(mb.command_histograms == nil)
 	mb.command_histograms_size = mb.command_split.num_types
-	mb.command_histograms = make([]histogramCommand, (mb.command_histograms_size))
+	if cap(mb.command_histograms) < int(mb.command_histograms_size) {
+		mb.command_histograms = make([]histogramCommand, (mb.command_histograms_size))
+	} else {
+		mb.command_histograms = mb.command_histograms[:mb.command_histograms_size]
+	}
 	clearHistogramsCommand(mb.command_histograms, mb.command_histograms_size)
 
-	buildHistogramsWithContext(cmds, num_commands, &mb.literal_split, &mb.command_split, &mb.distance_split, ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_modes, literal_histograms, mb.command_histograms, distance_histograms)
+	buildHistogramsWithContext(cmds, &mb.literal_split, &mb.command_split, &mb.distance_split, ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_modes, literal_histograms, mb.command_histograms, distance_histograms)
 	literal_context_modes = nil
 
-	assert(mb.literal_context_map == nil)
 	mb.literal_context_map_size = mb.literal_split.num_types << literalContextBits
-	mb.literal_context_map = make([]uint32, (mb.literal_context_map_size))
+	if cap(mb.literal_context_map) < int(mb.literal_context_map_size) {
+		mb.literal_context_map = make([]uint32, (mb.literal_context_map_size))
+	} else {
+		mb.literal_context_map = mb.literal_context_map[:mb.literal_context_map_size]
+	}
 
-	assert(mb.literal_histograms == nil)
 	mb.literal_histograms_size = mb.literal_context_map_size
-	mb.literal_histograms = make([]histogramLiteral, (mb.literal_histograms_size))
+	if cap(mb.literal_histograms) < int(mb.literal_histograms_size) {
+		mb.literal_histograms = make([]histogramLiteral, (mb.literal_histograms_size))
+	} else {
+		mb.literal_histograms = mb.literal_histograms[:mb.literal_histograms_size]
+	}
 
 	clusterHistogramsLiteral(literal_histograms, literal_histograms_size, buildMetaBlock_kMaxNumberOfHistograms, mb.literal_histograms, &mb.literal_histograms_size, mb.literal_context_map)
 	literal_histograms = nil
@@ -239,13 +248,19 @@ func buildMetaBlock(ringbuffer []byte, pos uint, mask uint, params *encoderParam
 		}
 	}
 
-	assert(mb.distance_context_map == nil)
 	mb.distance_context_map_size = mb.distance_split.num_types << distanceContextBits
-	mb.distance_context_map = make([]uint32, (mb.distance_context_map_size))
+	if cap(mb.distance_context_map) < int(mb.distance_context_map_size) {
+		mb.distance_context_map = make([]uint32, (mb.distance_context_map_size))
+	} else {
+		mb.distance_context_map = mb.distance_context_map[:mb.distance_context_map_size]
+	}
 
-	assert(mb.distance_histograms == nil)
 	mb.distance_histograms_size = mb.distance_context_map_size
-	mb.distance_histograms = make([]histogramDistance, (mb.distance_histograms_size))
+	if cap(mb.distance_histograms) < int(mb.distance_histograms_size) {
+		mb.distance_histograms = make([]histogramDistance, (mb.distance_histograms_size))
+	} else {
+		mb.distance_histograms = mb.distance_histograms[:mb.distance_histograms_size]
+	}
 
 	clusterHistogramsDistance(distance_histograms, mb.distance_context_map_size, buildMetaBlock_kMaxNumberOfHistograms, mb.distance_histograms, &mb.distance_histograms_size, mb.distance_context_map)
 	distance_histograms = nil
@@ -298,9 +313,12 @@ func initContextBlockSplitter(self *contextBlockSplitter, alphabet_size uint, nu
 	brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
 	brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
 	split.num_blocks = max_num_blocks
-	assert(*histograms == nil)
 	*histograms_size = max_num_types * num_contexts
-	*histograms = make([]histogramLiteral, (*histograms_size))
+	if histograms == nil || cap(*histograms) < int(*histograms_size) {
+		*histograms = make([]histogramLiteral, (*histograms_size))
+	} else {
+		*histograms = (*histograms)[:*histograms_size]
+	}
 	self.histograms_ = *histograms
 
 	/* Clear only current histogram. */
@@ -453,9 +471,12 @@ func contextBlockSplitterAddSymbol(self *contextBlockSplitter, symbol uint, cont
 
 func mapStaticContexts(num_contexts uint, static_context_map []uint32, mb *metaBlockSplit) {
 	var i uint
-	assert(mb.literal_context_map == nil)
 	mb.literal_context_map_size = mb.literal_split.num_types << literalContextBits
-	mb.literal_context_map = make([]uint32, (mb.literal_context_map_size))
+	if cap(mb.literal_context_map) < int(mb.literal_context_map_size) {
+		mb.literal_context_map = make([]uint32, (mb.literal_context_map_size))
+	} else {
+		mb.literal_context_map = mb.literal_context_map[:mb.literal_context_map_size]
+	}
 
 	for i = 0; i < mb.literal_split.num_types; i++ {
 		var offset uint32 = uint32(i * num_contexts)
@@ -466,7 +487,7 @@ func mapStaticContexts(num_contexts uint, static_context_map []uint32, mb *metaB
 	}
 }
 
-func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, n_commands uint, mb *metaBlockSplit) {
+func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, mb *metaBlockSplit) {
 	var lit_blocks struct {
 		plain blockSplitterLiteral
 		ctx   contextBlockSplitter
@@ -474,8 +495,7 @@ func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_b
 	var cmd_blocks blockSplitterCommand
 	var dist_blocks blockSplitterDistance
 	var num_literals uint = 0
-	var i uint
-	for i = 0; i < n_commands; i++ {
+	for i := range commands {
 		num_literals += uint(commands[i].insert_len_)
 	}
 
@@ -485,11 +505,10 @@ func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_b
 		initContextBlockSplitter(&lit_blocks.ctx, 256, num_contexts, 512, 400.0, num_literals, &mb.literal_split, &mb.literal_histograms, &mb.literal_histograms_size)
 	}
 
-	initBlockSplitterCommand(&cmd_blocks, numCommandSymbols, 1024, 500.0, n_commands, &mb.command_split, &mb.command_histograms, &mb.command_histograms_size)
-	initBlockSplitterDistance(&dist_blocks, 64, 512, 100.0, n_commands, &mb.distance_split, &mb.distance_histograms, &mb.distance_histograms_size)
+	initBlockSplitterCommand(&cmd_blocks, numCommandSymbols, 1024, 500.0, uint(len(commands)), &mb.command_split, &mb.command_histograms, &mb.command_histograms_size)
+	initBlockSplitterDistance(&dist_blocks, 64, 512, 100.0, uint(len(commands)), &mb.distance_split, &mb.distance_histograms, &mb.distance_histograms_size)
 
-	for i = 0; i < n_commands; i++ {
-		var cmd command = commands[i]
+	for _, cmd := range commands {
 		var j uint
 		blockSplitterAddSymbolCommand(&cmd_blocks, uint(cmd.cmd_prefix_))
 		for j = uint(cmd.insert_len_); j != 0; j-- {
@@ -530,11 +549,11 @@ func buildMetaBlockGreedyInternal(ringbuffer []byte, pos uint, mask uint, prev_b
 	}
 }
 
-func buildMetaBlockGreedy(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, n_commands uint, mb *metaBlockSplit) {
+func buildMetaBlockGreedy(ringbuffer []byte, pos uint, mask uint, prev_byte byte, prev_byte2 byte, literal_context_lut contextLUT, num_contexts uint, static_context_map []uint32, commands []command, mb *metaBlockSplit) {
 	if num_contexts == 1 {
-		buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, 1, nil, commands, n_commands, mb)
+		buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, 1, nil, commands, mb)
 	} else {
-		buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, num_contexts, static_context_map, commands, n_commands, mb)
+		buildMetaBlockGreedyInternal(ringbuffer, pos, mask, prev_byte, prev_byte2, literal_context_lut, num_contexts, static_context_map, commands, mb)
 	}
 }
 
diff --git a/vendor/github.com/andybalholm/brotli/metablock_command.go b/vendor/github.com/andybalholm/brotli/metablock_command.go
index d47541c..14c7b77 100644
--- a/vendor/github.com/andybalholm/brotli/metablock_command.go
+++ b/vendor/github.com/andybalholm/brotli/metablock_command.go
@@ -43,9 +43,12 @@ func initBlockSplitterCommand(self *blockSplitterCommand, alphabet_size uint, mi
 	brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
 	brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
 	self.split_.num_blocks = max_num_blocks
-	assert(*histograms == nil)
 	*histograms_size = max_num_types
-	*histograms = make([]histogramCommand, (*histograms_size))
+	if histograms == nil || cap(*histograms) < int(*histograms_size) {
+		*histograms = make([]histogramCommand, (*histograms_size))
+	} else {
+		*histograms = (*histograms)[:*histograms_size]
+	}
 	self.histograms_ = *histograms
 
 	/* Clear only current histogram. */
diff --git a/vendor/github.com/andybalholm/brotli/metablock_distance.go b/vendor/github.com/andybalholm/brotli/metablock_distance.go
index 9592312..5110a81 100644
--- a/vendor/github.com/andybalholm/brotli/metablock_distance.go
+++ b/vendor/github.com/andybalholm/brotli/metablock_distance.go
@@ -43,9 +43,12 @@ func initBlockSplitterDistance(self *blockSplitterDistance, alphabet_size uint,
 	brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
 	brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
 	self.split_.num_blocks = max_num_blocks
-	assert(*histograms == nil)
 	*histograms_size = max_num_types
-	*histograms = make([]histogramDistance, (*histograms_size))
+	if histograms == nil || cap(*histograms) < int(*histograms_size) {
+		*histograms = make([]histogramDistance, *histograms_size)
+	} else {
+		*histograms = (*histograms)[:*histograms_size]
+	}
 	self.histograms_ = *histograms
 
 	/* Clear only current histogram. */
diff --git a/vendor/github.com/andybalholm/brotli/metablock_literal.go b/vendor/github.com/andybalholm/brotli/metablock_literal.go
index d7e8a7c..307f8da 100644
--- a/vendor/github.com/andybalholm/brotli/metablock_literal.go
+++ b/vendor/github.com/andybalholm/brotli/metablock_literal.go
@@ -43,9 +43,12 @@ func initBlockSplitterLiteral(self *blockSplitterLiteral, alphabet_size uint, mi
 	brotli_ensure_capacity_uint8_t(&split.types, &split.types_alloc_size, max_num_blocks)
 	brotli_ensure_capacity_uint32_t(&split.lengths, &split.lengths_alloc_size, max_num_blocks)
 	self.split_.num_blocks = max_num_blocks
-	assert(*histograms == nil)
 	*histograms_size = max_num_types
-	*histograms = make([]histogramLiteral, (*histograms_size))
+	if histograms == nil || cap(*histograms) < int(*histograms_size) {
+		*histograms = make([]histogramLiteral, *histograms_size)
+	} else {
+		*histograms = (*histograms)[:*histograms_size]
+	}
 	self.histograms_ = *histograms
 
 	/* Clear only current histogram. */
diff --git a/vendor/github.com/andybalholm/brotli/reader.go b/vendor/github.com/andybalholm/brotli/reader.go
index 5c795e6..cdc6764 100644
--- a/vendor/github.com/andybalholm/brotli/reader.go
+++ b/vendor/github.com/andybalholm/brotli/reader.go
@@ -33,7 +33,9 @@ func NewReader(src io.Reader) *Reader {
 func (r *Reader) Reset(src io.Reader) error {
 	decoderStateInit(r)
 	r.src = src
-	r.buf = make([]byte, readBufSize)
+	if r.buf == nil {
+		r.buf = make([]byte, readBufSize)
+	}
 	return nil
 }
 
diff --git a/vendor/github.com/andybalholm/brotli/ringbuffer.go b/vendor/github.com/andybalholm/brotli/ringbuffer.go
index 693a3f6..1c8f86f 100644
--- a/vendor/github.com/andybalholm/brotli/ringbuffer.go
+++ b/vendor/github.com/andybalholm/brotli/ringbuffer.go
@@ -27,10 +27,7 @@ type ringBuffer struct {
 }
 
 func ringBufferInit(rb *ringBuffer) {
-	rb.cur_size_ = 0
 	rb.pos_ = 0
-	rb.data_ = nil
-	rb.buffer_ = nil
 }
 
 func ringBufferSetup(params *encoderParams, rb *ringBuffer) {
@@ -47,11 +44,16 @@ const kSlackForEightByteHashingEverywhere uint = 7
 /* Allocates or re-allocates data_ to the given length + plus some slack
    region before and after. Fills the slack regions with zeros. */
 func ringBufferInitBuffer(buflen uint32, rb *ringBuffer) {
-	var new_data []byte = make([]byte, (2 + uint(buflen) + kSlackForEightByteHashingEverywhere))
+	var new_data []byte
 	var i uint
+	size := 2 + int(buflen) + int(kSlackForEightByteHashingEverywhere)
+	if cap(rb.data_) < size {
+		new_data = make([]byte, size)
+	} else {
+		new_data = rb.data_[:size]
+	}
 	if rb.data_ != nil {
 		copy(new_data, rb.data_[:2+rb.cur_size_+uint32(kSlackForEightByteHashingEverywhere)])
-		rb.data_ = nil
 	}
 
 	rb.data_ = new_data
diff --git a/vendor/github.com/andybalholm/brotli/static_dict.go b/vendor/github.com/andybalholm/brotli/static_dict.go
index 8e7492d..bc05566 100644
--- a/vendor/github.com/andybalholm/brotli/static_dict.go
+++ b/vendor/github.com/andybalholm/brotli/static_dict.go
@@ -77,8 +77,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 		var offset uint = uint(dict.buckets[hash(data)])
 		var end bool = offset == 0
 		for !end {
-			var w dictWord
-			w = dict.dict_words[offset]
+			w := dict.dict_words[offset]
 			offset++
 			var l uint = uint(w.len) & 0x1F
 			var n uint = uint(1) << dict.words.size_bits_by_length[l]
@@ -431,8 +430,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 		var offset uint = uint(dict.buckets[hash(data[1:])])
 		var end bool = offset == 0
 		for !end {
-			var w dictWord
-			w = dict.dict_words[offset]
+			w := dict.dict_words[offset]
 			offset++
 			var l uint = uint(w.len) & 0x1F
 			var n uint = uint(1) << dict.words.size_bits_by_length[l]
@@ -596,8 +594,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 			var offset uint = uint(dict.buckets[hash(data[2:])])
 			var end bool = offset == 0
 			for !end {
-				var w dictWord
-				w = dict.dict_words[offset]
+				w := dict.dict_words[offset]
 				offset++
 				var l uint = uint(w.len) & 0x1F
 				var n uint = uint(1) << dict.words.size_bits_by_length[l]
@@ -629,8 +626,7 @@ func findAllStaticDictionaryMatches(dict *encoderDictionary, data []byte, min_le
 			var offset uint = uint(dict.buckets[hash(data[5:])])
 			var end bool = offset == 0
 			for !end {
-				var w dictWord
-				w = dict.dict_words[offset]
+				w := dict.dict_words[offset]
 				offset++
 				var l uint = uint(w.len) & 0x1F
 				var n uint = uint(1) << dict.words.size_bits_by_length[l]
diff --git a/vendor/github.com/andybalholm/brotli/utf8_util.go b/vendor/github.com/andybalholm/brotli/utf8_util.go
index f86de3d..3244247 100644
--- a/vendor/github.com/andybalholm/brotli/utf8_util.go
+++ b/vendor/github.com/andybalholm/brotli/utf8_util.go
@@ -58,8 +58,7 @@ func isMostlyUTF8(data []byte, pos uint, mask uint, length uint, min_fraction fl
 	var i uint = 0
 	for i < length {
 		var symbol int
-		var current_data []byte
-		current_data = data[(pos+i)&mask:]
+		current_data := data[(pos+i)&mask:]
 		var bytes_read uint = parseAsUTF8(&symbol, current_data, length-i)
 		i += bytes_read
 		if symbol < 0x110000 {
diff --git a/vendor/github.com/andybalholm/brotli/write_bits.go b/vendor/github.com/andybalholm/brotli/write_bits.go
index 8f15c20..8729901 100644
--- a/vendor/github.com/andybalholm/brotli/write_bits.go
+++ b/vendor/github.com/andybalholm/brotli/write_bits.go
@@ -1,5 +1,7 @@
 package brotli
 
+import "encoding/binary"
+
 /* Copyright 2010 Google Inc. All Rights Reserved.
 
    Distributed under MIT license.
@@ -24,21 +26,15 @@ package brotli
    For n bits, we take the last 5 bits, OR that with high bits in BYTE-0,
    and locate the rest in BYTE+1, BYTE+2, etc. */
 func writeBits(n_bits uint, bits uint64, pos *uint, array []byte) {
-	var array_pos []byte = array[*pos>>3:]
-	var bits_reserved_in_first_byte uint = (*pos & 7)
-	/* implicit & 0xFF is assumed for uint8_t arithmetics */
-
-	var bits_left_to_write uint
-	bits <<= bits_reserved_in_first_byte
-	array_pos[0] |= byte(bits)
-	array_pos = array_pos[1:]
-	for bits_left_to_write = n_bits + bits_reserved_in_first_byte; bits_left_to_write >= 9; bits_left_to_write -= 8 {
-		bits >>= 8
-		array_pos[0] = byte(bits)
-		array_pos = array_pos[1:]
-	}
-
-	array_pos[0] = 0
+	/* This branch of the code can write up to 56 bits at a time,
+	   7 bits are lost by being perhaps already in *p and at least
+	   1 bit is needed to initialize the bit-stream ahead (i.e. if 7
+	   bits are in *p and we write 57 bits, then the next write will
+	   access a byte that was never initialized). */
+	p := array[*pos>>3:]
+	v := uint64(p[0])
+	v |= bits << (*pos & 7)
+	binary.LittleEndian.PutUint64(p, v)
 	*pos += n_bits
 }
 
diff --git a/vendor/github.com/andybalholm/brotli/writer.go b/vendor/github.com/andybalholm/brotli/writer.go
index ec333f9..39feaef 100644
--- a/vendor/github.com/andybalholm/brotli/writer.go
+++ b/vendor/github.com/andybalholm/brotli/writer.go
@@ -61,12 +61,16 @@ func (w *Writer) Reset(dst io.Writer) {
 		w.params.lgwin = uint(w.options.LGWin)
 	}
 	w.dst = dst
+	w.err = nil
 }
 
 func (w *Writer) writeChunk(p []byte, op int) (n int, err error) {
 	if w.dst == nil {
 		return 0, errWriterClosed
 	}
+	if w.err != nil {
+		return 0, w.err
+	}
 
 	for {
 		availableIn := uint(len(p))
@@ -79,16 +83,8 @@ func (w *Writer) writeChunk(p []byte, op int) (n int, err error) {
 			return n, errEncode
 		}
 
-		outputData := encoderTakeOutput(w)
-
-		if len(outputData) > 0 {
-			_, err = w.dst.Write(outputData)
-			if err != nil {
-				return n, err
-			}
-		}
-		if len(p) == 0 {
-			return n, nil
+		if len(p) == 0 || w.err != nil {
+			return n, w.err
 		}
 	}
 }
diff --git a/vendor/github.com/bitrise-io/go-utils/command/command.go b/vendor/github.com/bitrise-io/go-utils/command/command.go
index 4cd005a..c068490 100644
--- a/vendor/github.com/bitrise-io/go-utils/command/command.go
+++ b/vendor/github.com/bitrise-io/go-utils/command/command.go
@@ -2,14 +2,11 @@ package command
 
 import (
 	"errors"
-	"fmt"
 	"io"
 	"os"
 	"os/exec"
 	"strconv"
 	"strings"
-
-	"github.com/bitrise-io/go-utils/errorutil"
 )
 
 // ----------
@@ -139,18 +136,10 @@ func PrintableCommandArgs(isQuoteFirst bool, fullCommandArgs []string) string {
 }
 
 // RunCmdAndReturnExitCode ...
-func RunCmdAndReturnExitCode(cmd *exec.Cmd) (int, error) {
-	err := cmd.Run()
-	if err != nil {
-		exitCode, castErr := errorutil.CmdExitCodeFromError(err)
-		if castErr != nil {
-			return 1, fmt.Errorf("failed get exit code from error: %s, error: %s", err, castErr)
-		}
-
-		return exitCode, err
-	}
-
-	return 0, nil
+func RunCmdAndReturnExitCode(cmd *exec.Cmd) (exitCode int, err error) {
+	err = cmd.Run()
+	exitCode = cmd.ProcessState.ExitCode()
+	return
 }
 
 // RunCmdAndReturnTrimmedOutput ...
diff --git a/vendor/github.com/bitrise-io/go-utils/command/git/commands.go b/vendor/github.com/bitrise-io/go-utils/command/git/commands.go
index c1e276b..1c217d8 100644
--- a/vendor/github.com/bitrise-io/go-utils/command/git/commands.go
+++ b/vendor/github.com/bitrise-io/go-utils/command/git/commands.go
@@ -62,8 +62,10 @@ func (g *Git) Clean(options ...string) *command.Model {
 }
 
 // SubmoduleUpdate updates the registered submodules.
-func (g *Git) SubmoduleUpdate() *command.Model {
-	return g.command("submodule", "update", "--init", "--recursive")
+func (g *Git) SubmoduleUpdate(opts ...string) *command.Model {
+	args := []string{"submodule", "update", "--init", "--recursive"}
+	args = append(args, opts...)
+	return g.command(args...)
 }
 
 // SubmoduleForeach evaluates an arbitrary git command in each checked out
@@ -86,8 +88,9 @@ func (g *Git) Add(pathspec string) *command.Model {
 }
 
 // Branch lists branches.
-func (g *Git) Branch() *command.Model {
-	return g.command("branch")
+func (g *Git) Branch(opts ...string) *command.Model {
+	args := append([]string{"branch"}, opts...)
+	return g.command(args...)
 }
 
 // NewBranch creates a new branch as if git-branch were called and then check it out.
@@ -101,8 +104,11 @@ func (g *Git) Apply(patch string) *command.Model {
 }
 
 // Log shows the commit logs. The format parameter controls what is shown and how.
-func (g *Git) Log(format string) *command.Model {
-	return g.command("log", "-1", "--format="+format)
+// Revision range can be optionally specified using the opts parameter.
+func (g *Git) Log(format string, opts ...string) *command.Model {
+	args := []string{"log", "-1", "--format=" + format}
+	args = append(args, opts...)
+	return g.command(args...)
 }
 
 // RevList lists commit objects in reverse chronological order.
@@ -135,6 +141,24 @@ func (g *Git) Status(opts ...string) *command.Model {
 }
 
 // Config sets a git config setting for the repository.
-func (g *Git) Config(key string, value string) *command.Model {
-	return g.command("config", key, value)
+func (g *Git) Config(key string, value string, opts ...string) *command.Model {
+	args := []string{"config", key, value}
+	args = append(args, opts...)
+	return g.command(args...)
+}
+
+// SparseCheckoutInit initializes the sparse-checkout config file.
+func (g *Git) SparseCheckoutInit(cone bool) *command.Model {
+	args := []string{"sparse-checkout", "init"}
+	if cone {
+		args = append(args, "--cone")
+	}
+	return g.command(args...)
+}
+
+// SparseCheckoutSet writes the provided patterns to the sparse-checkout config file.
+func (g *Git) SparseCheckoutSet(opts ...string) *command.Model {
+	args := []string{"sparse-checkout", "set"}
+	args = append(args, opts...)
+	return g.command(args...)
 }
diff --git a/vendor/github.com/bitrise-io/go-utils/errorutil/errorutil.go b/vendor/github.com/bitrise-io/go-utils/errorutil/errorutil.go
deleted file mode 100644
index 1128416..0000000
--- a/vendor/github.com/bitrise-io/go-utils/errorutil/errorutil.go
+++ /dev/null
@@ -1,36 +0,0 @@
-package errorutil
-
-import (
-	"errors"
-	"os/exec"
-	"regexp"
-	"syscall"
-)
-
-// IsExitStatusError ...
-func IsExitStatusError(err error) bool {
-	return IsExitStatusErrorStr(err.Error())
-}
-
-// IsExitStatusErrorStr ...
-func IsExitStatusErrorStr(errString string) bool {
-	// example exit status error string: exit status 1
-	var rex = regexp.MustCompile(`^exit status [0-9]{1,3}$`)
-	return rex.MatchString(errString)
-}
-
-// CmdExitCodeFromError ...
-func CmdExitCodeFromError(err error) (int, error) {
-	cmdExitCode := 0
-	if err != nil {
-		if exitError, ok := err.(*exec.ExitError); ok {
-			waitStatus, ok := exitError.Sys().(syscall.WaitStatus)
-			if !ok {
-				return 1, errors.New("Failed to cast exit status")
-			}
-			cmdExitCode = waitStatus.ExitStatus()
-		}
-		return cmdExitCode, nil
-	}
-	return 0, nil
-}
diff --git a/vendor/github.com/bitrise-io/go-utils/pathutil/path_filter.go b/vendor/github.com/bitrise-io/go-utils/pathutil/path_filter.go
new file mode 100644
index 0000000..db0665d
--- /dev/null
+++ b/vendor/github.com/bitrise-io/go-utils/pathutil/path_filter.go
@@ -0,0 +1,164 @@
+package pathutil
+
+import (
+	"errors"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"regexp"
+	"strings"
+)
+
+// ListEntries filters contents of a directory using the provided filters
+func ListEntries(dir string, filters ...FilterFunc) ([]string, error) {
+	absDir, err := filepath.Abs(dir)
+	if err != nil {
+		return []string{}, err
+	}
+
+	entries, err := ioutil.ReadDir(absDir)
+	if err != nil {
+		return []string{}, err
+	}
+
+	var paths []string
+	for _, entry := range entries {
+		pth := filepath.Join(absDir, entry.Name())
+		paths = append(paths, pth)
+	}
+
+	return FilterPaths(paths, filters...)
+}
+
+// FilterPaths ...
+func FilterPaths(fileList []string, filters ...FilterFunc) ([]string, error) {
+	var filtered []string
+
+	for _, pth := range fileList {
+		allowed := true
+		for _, filter := range filters {
+			if allows, err := filter(pth); err != nil {
+				return []string{}, err
+			} else if !allows {
+				allowed = false
+				break
+			}
+		}
+		if allowed {
+			filtered = append(filtered, pth)
+		}
+	}
+
+	return filtered, nil
+}
+
+// FilterFunc ...
+type FilterFunc func(string) (bool, error)
+
+// BaseFilter ...
+func BaseFilter(base string, allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		b := filepath.Base(pth)
+		return allowed == strings.EqualFold(base, b), nil
+	}
+}
+
+// ExtensionFilter ...
+func ExtensionFilter(ext string, allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		e := filepath.Ext(pth)
+		return allowed == strings.EqualFold(ext, e), nil
+	}
+}
+
+// RegexpFilter ...
+func RegexpFilter(pattern string, allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		re := regexp.MustCompile(pattern)
+		found := re.FindString(pth) != ""
+		return allowed == found, nil
+	}
+}
+
+// ComponentFilter ...
+func ComponentFilter(component string, allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		found := false
+		pathComponents := strings.Split(pth, string(filepath.Separator))
+		for _, c := range pathComponents {
+			if c == component {
+				found = true
+			}
+		}
+		return allowed == found, nil
+	}
+}
+
+// ComponentWithExtensionFilter ...
+func ComponentWithExtensionFilter(ext string, allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		found := false
+		pathComponents := strings.Split(pth, string(filepath.Separator))
+		for _, c := range pathComponents {
+			e := filepath.Ext(c)
+			if e == ext {
+				found = true
+			}
+		}
+		return allowed == found, nil
+	}
+}
+
+// IsDirectoryFilter ...
+func IsDirectoryFilter(allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		fileInf, err := os.Lstat(pth)
+		if err != nil {
+			return false, err
+		}
+		if fileInf == nil {
+			return false, errors.New("no file info available")
+		}
+		return allowed == fileInf.IsDir(), nil
+	}
+}
+
+// InDirectoryFilter ...
+func InDirectoryFilter(dir string, allowed bool) FilterFunc {
+	return func(pth string) (bool, error) {
+		in := filepath.Dir(pth) == dir
+		return allowed == in, nil
+	}
+}
+
+// DirectoryContainsFileFilter returns a FilterFunc that checks if a directory contains a file
+func DirectoryContainsFileFilter(fileName string) FilterFunc {
+	return func(pth string) (bool, error) {
+		isDir, err := IsDirectoryFilter(true)(pth)
+		if err != nil {
+			return false, err
+		}
+		if !isDir {
+			return false, nil
+		}
+
+		absPath := filepath.Join(pth, fileName)
+		if _, err := os.Lstat(absPath); err != nil {
+			if !os.IsNotExist(err) {
+				return false, err
+			}
+			return false, nil
+		}
+		return true, nil
+	}
+}
+
+// FileContainsFilter ...
+func FileContainsFilter(pth, str string) (bool, error) {
+	bytes, err := ioutil.ReadFile(pth)
+	if err != nil {
+		return false, err
+	}
+
+	return strings.Contains(string(bytes), str), nil
+}
diff --git a/vendor/github.com/bitrise-io/go-utils/pathutil/pathutil.go b/vendor/github.com/bitrise-io/go-utils/pathutil/pathutil.go
index a88ff5f..947c97c 100644
--- a/vendor/github.com/bitrise-io/go-utils/pathutil/pathutil.go
+++ b/vendor/github.com/bitrise-io/go-utils/pathutil/pathutil.go
@@ -10,47 +10,34 @@ import (
 	"strings"
 )
 
-// RevokableChangeDir ...
-func RevokableChangeDir(dir string) (func() error, error) {
-	origDir, err := CurrentWorkingDirectoryAbsolutePath()
-	if err != nil {
-		return nil, err
-	}
-
-	revokeFn := func() error {
-		return os.Chdir(origDir)
+// NormalizedOSTempDirPath ...
+// Creates a temp dir, and returns its path.
+// If tmpDirNamePrefix is provided it'll be used
+//  as the tmp dir's name prefix.
+// Normalized: it's guaranteed that the path won't end with '/'.
+func NormalizedOSTempDirPath(tmpDirNamePrefix string) (retPth string, err error) {
+	retPth, err = ioutil.TempDir("", tmpDirNamePrefix)
+	if strings.HasSuffix(retPth, "/") {
+		retPth = retPth[:len(retPth)-1]
 	}
-
-	return revokeFn, os.Chdir(dir)
+	return
 }
 
-// ChangeDirForFunction ...
-func ChangeDirForFunction(dir string, fn func()) error {
-	revokeFn, err := RevokableChangeDir(dir)
-	if err != nil {
-		return err
-	}
-
-	fn()
-
-	return revokeFn()
+// CurrentWorkingDirectoryAbsolutePath ...
+func CurrentWorkingDirectoryAbsolutePath() (string, error) {
+	return filepath.Abs("./")
 }
 
-// IsRelativePath ...
-func IsRelativePath(pth string) bool {
-	if strings.HasPrefix(pth, "./") {
-		return true
-	}
-
-	if strings.HasPrefix(pth, "/") {
-		return false
-	}
-
-	if strings.HasPrefix(pth, "$") {
-		return false
+// UserHomeDir ...
+func UserHomeDir() string {
+	if runtime.GOOS == "windows" {
+		home := os.Getenv("HOMEDRIVE") + os.Getenv("HOMEPATH")
+		if home == "" {
+			home = os.Getenv("USERPROFILE")
+		}
+		return home
 	}
-
-	return true
+	return os.Getenv("HOME")
 }
 
 // EnsureDirExist ...
@@ -76,12 +63,6 @@ func genericIsPathExists(pth string) (os.FileInfo, bool, error) {
 	return fileInf, false, err
 }
 
-// IsPathExists ...
-func IsPathExists(pth string) (bool, error) {
-	_, isExists, err := genericIsPathExists(pth)
-	return isExists, err
-}
-
 // PathCheckAndInfos ...
 // Returns:
 // 1. file info or nil
@@ -106,6 +87,32 @@ func IsDirExists(pth string) (bool, error) {
 	return fileInf.IsDir(), nil
 }
 
+// IsPathExists ...
+func IsPathExists(pth string) (bool, error) {
+	_, isExists, err := genericIsPathExists(pth)
+	return isExists, err
+}
+
+//
+// Path modifier functions
+
+// PathModifier ...
+type PathModifier interface {
+	AbsPath(pth string) (string, error)
+}
+
+type defaultPathModifier struct{}
+
+// NewPathModifier ...
+func NewPathModifier() PathModifier {
+	return defaultPathModifier{}
+}
+
+// AbsPath ...
+func (defaultPathModifier) AbsPath(pth string) (string, error) {
+	return AbsPath(pth)
+}
+
 // AbsPath expands ENV vars and the ~ character
 //	then call Go's Abs
 func AbsPath(pth string) (string, error) {
@@ -150,37 +157,65 @@ func ExpandTilde(pth string) (string, error) {
 	return pth, nil
 }
 
-// CurrentWorkingDirectoryAbsolutePath ...
-func CurrentWorkingDirectoryAbsolutePath() (string, error) {
-	return filepath.Abs("./")
-}
+// IsRelativePath ...
+func IsRelativePath(pth string) bool {
+	if strings.HasPrefix(pth, "./") {
+		return true
+	}
 
-// UserHomeDir ...
-func UserHomeDir() string {
-	if runtime.GOOS == "windows" {
-		home := os.Getenv("HOMEDRIVE") + os.Getenv("HOMEPATH")
-		if home == "" {
-			home = os.Getenv("USERPROFILE")
-		}
-		return home
+	if strings.HasPrefix(pth, "/") {
+		return false
 	}
-	return os.Getenv("HOME")
-}
 
-// NormalizedOSTempDirPath ...
-// Creates a temp dir, and returns its path.
-// If tmpDirNamePrefix is provided it'll be used
-//  as the tmp dir's name prefix.
-// Normalized: it's guaranteed that the path won't end with '/'.
-func NormalizedOSTempDirPath(tmpDirNamePrefix string) (retPth string, err error) {
-	retPth, err = ioutil.TempDir("", tmpDirNamePrefix)
-	if strings.HasSuffix(retPth, "/") {
-		retPth = retPth[:len(retPth)-1]
+	if strings.HasPrefix(pth, "$") {
+		return false
 	}
-	return
+
+	return true
 }
 
 // GetFileName returns the name of the file from a given path or the name of the directory if it is a directory
 func GetFileName(path string) string {
 	return strings.TrimSuffix(filepath.Base(path), filepath.Ext(path))
 }
+
+// EscapeGlobPath escapes a partial path, determined at runtime, used as a parameter for filepath.Glob
+func EscapeGlobPath(path string) string {
+	var escaped string
+	for _, ch := range path {
+		if ch == '[' || ch == ']' || ch == '-' || ch == '*' || ch == '?' || ch == '\\' {
+			escaped += "\\"
+		}
+		escaped += string(ch)
+	}
+	return escaped
+}
+
+//
+// Change dir functions
+
+// RevokableChangeDir ...
+func RevokableChangeDir(dir string) (func() error, error) {
+	origDir, err := CurrentWorkingDirectoryAbsolutePath()
+	if err != nil {
+		return nil, err
+	}
+
+	revokeFn := func() error {
+		return os.Chdir(origDir)
+	}
+
+	return revokeFn, os.Chdir(dir)
+}
+
+// ChangeDirForFunction ...
+func ChangeDirForFunction(dir string, fn func()) error {
+	revokeFn, err := RevokableChangeDir(dir)
+	if err != nil {
+		return err
+	}
+
+	fn()
+
+	return revokeFn()
+}
diff --git a/vendor/github.com/bitrise-io/go-utils/pathutil/sortable_path.go b/vendor/github.com/bitrise-io/go-utils/pathutil/sortable_path.go
new file mode 100644
index 0000000..97f63c3
--- /dev/null
+++ b/vendor/github.com/bitrise-io/go-utils/pathutil/sortable_path.go
@@ -0,0 +1,119 @@
+package pathutil
+
+import (
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// ListPathInDirSortedByComponents ...
+func ListPathInDirSortedByComponents(searchDir string, relPath bool) ([]string, error) {
+	searchDir, err := filepath.Abs(searchDir)
+	if err != nil {
+		return []string{}, err
+	}
+
+	var fileList []string
+
+	if err := filepath.Walk(searchDir, func(path string, _ os.FileInfo, walkErr error) error {
+		if walkErr != nil {
+			return walkErr
+		}
+
+		if relPath {
+			rel, err := filepath.Rel(searchDir, path)
+			if err != nil {
+				return err
+			}
+			path = rel
+		}
+
+		fileList = append(fileList, path)
+
+		return nil
+	}); err != nil {
+		return []string{}, err
+	}
+	return SortPathsByComponents(fileList)
+}
+
+// SortablePath ...
+type SortablePath struct {
+	Pth        string
+	AbsPth     string
+	Components []string
+}
+
+// NewSortablePath ...
+func NewSortablePath(pth string) (SortablePath, error) {
+	absPth, err := AbsPath(pth)
+	if err != nil {
+		return SortablePath{}, err
+	}
+
+	components := strings.Split(absPth, string(os.PathSeparator))
+	fixedComponents := []string{}
+	for _, comp := range components {
+		if comp != "" {
+			fixedComponents = append(fixedComponents, comp)
+		}
+	}
+
+	return SortablePath{
+		Pth:        pth,
+		AbsPth:     absPth,
+		Components: fixedComponents,
+	}, nil
+}
+
+// BySortablePathComponents ..
+type BySortablePathComponents []SortablePath
+
+func (s BySortablePathComponents) Len() int {
+	return len(s)
+}
+func (s BySortablePathComponents) Swap(i, j int) {
+	s[i], s[j] = s[j], s[i]
+}
+func (s BySortablePathComponents) Less(i, j int) bool {
+	path1 := s[i]
+	path2 := s[j]
+
+	d1 := len(path1.Components)
+	d2 := len(path2.Components)
+
+	if d1 < d2 {
+		return true
+	} else if d1 > d2 {
+		return false
+	}
+
+	// if same component size,
+	// do alphabetic sort based on the last component
+	base1 := filepath.Base(path1.AbsPth)
+	base2 := filepath.Base(path2.AbsPth)
+
+	return base1 < base2
+}
+
+// SortPathsByComponents ...
+func SortPathsByComponents(paths []string) ([]string, error) {
+	sortableFiles := []SortablePath{}
+	for _, pth := range paths {
+		sortable, err := NewSortablePath(pth)
+		if err != nil {
+			return []string{}, err
+		}
+		sortableFiles = append(sortableFiles, sortable)
+	}
+
+	sort.Sort(BySortablePathComponents(sortableFiles))
+
+	sortedFiles := []string{}
+	for _, pth := range sortableFiles {
+		sortedFiles = append(sortedFiles, pth.Pth)
+	}
+
+	return sortedFiles, nil
+}
diff --git a/vendor/github.com/golang/snappy/AUTHORS b/vendor/github.com/golang/snappy/AUTHORS
index bcfa195..52ccb5a 100644
--- a/vendor/github.com/golang/snappy/AUTHORS
+++ b/vendor/github.com/golang/snappy/AUTHORS
@@ -8,8 +8,11 @@
 
 # Please keep the list sorted.
 
+Amazon.com, Inc
 Damian Gryski <dgryski@gmail.com>
+Eric Buth <eric@topos.com>
 Google Inc.
 Jan Mercl <0xjnml@gmail.com>
+Klaus Post <klauspost@gmail.com>
 Rodolfo Carvalho <rhcarvalho@gmail.com>
 Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/golang/snappy/CONTRIBUTORS b/vendor/github.com/golang/snappy/CONTRIBUTORS
index 931ae31..ea6524d 100644
--- a/vendor/github.com/golang/snappy/CONTRIBUTORS
+++ b/vendor/github.com/golang/snappy/CONTRIBUTORS
@@ -26,9 +26,13 @@
 
 # Please keep the list sorted.
 
+Alex Legg <alexlegg@google.com>
 Damian Gryski <dgryski@gmail.com>
+Eric Buth <eric@topos.com>
 Jan Mercl <0xjnml@gmail.com>
+Jonathan Swinney <jswinney@amazon.com>
 Kai Backman <kaib@golang.org>
+Klaus Post <klauspost@gmail.com>
 Marc-Antoine Ruel <maruel@chromium.org>
 Nigel Tao <nigeltao@golang.org>
 Rob Pike <r@golang.org>
diff --git a/vendor/github.com/golang/snappy/decode.go b/vendor/github.com/golang/snappy/decode.go
index 72efb03..23c6e26 100644
--- a/vendor/github.com/golang/snappy/decode.go
+++ b/vendor/github.com/golang/snappy/decode.go
@@ -52,6 +52,8 @@ const (
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Decode handles the Snappy block format, not the Snappy stream format.
 func Decode(dst, src []byte) ([]byte, error) {
 	dLen, s, err := decodedLen(src)
 	if err != nil {
@@ -83,6 +85,8 @@ func NewReader(r io.Reader) *Reader {
 }
 
 // Reader is an io.Reader that can read Snappy-compressed bytes.
+//
+// Reader handles the Snappy stream format, not the Snappy block format.
 type Reader struct {
 	r       io.Reader
 	err     error
@@ -114,32 +118,23 @@ func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
 	return true
 }
 
-// Read satisfies the io.Reader interface.
-func (r *Reader) Read(p []byte) (int, error) {
-	if r.err != nil {
-		return 0, r.err
-	}
-	for {
-		if r.i < r.j {
-			n := copy(p, r.decoded[r.i:r.j])
-			r.i += n
-			return n, nil
-		}
+func (r *Reader) fill() error {
+	for r.i >= r.j {
 		if !r.readFull(r.buf[:4], true) {
-			return 0, r.err
+			return r.err
 		}
 		chunkType := r.buf[0]
 		if !r.readHeader {
 			if chunkType != chunkTypeStreamIdentifier {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			r.readHeader = true
 		}
 		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
 		if chunkLen > len(r.buf) {
 			r.err = ErrUnsupported
-			return 0, r.err
+			return r.err
 		}
 
 		// The chunk types are specified at
@@ -149,11 +144,11 @@ func (r *Reader) Read(p []byte) (int, error) {
 			// Section 4.2. Compressed data (chunk type 0x00).
 			if chunkLen < checksumSize {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			buf := r.buf[:chunkLen]
 			if !r.readFull(buf, false) {
-				return 0, r.err
+				return r.err
 			}
 			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
 			buf = buf[checksumSize:]
@@ -161,19 +156,19 @@ func (r *Reader) Read(p []byte) (int, error) {
 			n, err := DecodedLen(buf)
 			if err != nil {
 				r.err = err
-				return 0, r.err
+				return r.err
 			}
 			if n > len(r.decoded) {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			if _, err := Decode(r.decoded, buf); err != nil {
 				r.err = err
-				return 0, r.err
+				return r.err
 			}
 			if crc(r.decoded[:n]) != checksum {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			r.i, r.j = 0, n
 			continue
@@ -182,25 +177,25 @@ func (r *Reader) Read(p []byte) (int, error) {
 			// Section 4.3. Uncompressed data (chunk type 0x01).
 			if chunkLen < checksumSize {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			buf := r.buf[:checksumSize]
 			if !r.readFull(buf, false) {
-				return 0, r.err
+				return r.err
 			}
 			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
 			// Read directly into r.decoded instead of via r.buf.
 			n := chunkLen - checksumSize
 			if n > len(r.decoded) {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			if !r.readFull(r.decoded[:n], false) {
-				return 0, r.err
+				return r.err
 			}
 			if crc(r.decoded[:n]) != checksum {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			r.i, r.j = 0, n
 			continue
@@ -209,15 +204,15 @@ func (r *Reader) Read(p []byte) (int, error) {
 			// Section 4.1. Stream identifier (chunk type 0xff).
 			if chunkLen != len(magicBody) {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			if !r.readFull(r.buf[:len(magicBody)], false) {
-				return 0, r.err
+				return r.err
 			}
 			for i := 0; i < len(magicBody); i++ {
 				if r.buf[i] != magicBody[i] {
 					r.err = ErrCorrupt
-					return 0, r.err
+					return r.err
 				}
 			}
 			continue
@@ -226,12 +221,44 @@ func (r *Reader) Read(p []byte) (int, error) {
 		if chunkType <= 0x7f {
 			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
 			r.err = ErrUnsupported
-			return 0, r.err
+			return r.err
 		}
 		// Section 4.4 Padding (chunk type 0xfe).
 		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
 		if !r.readFull(r.buf[:chunkLen], false) {
-			return 0, r.err
+			return r.err
 		}
 	}
+
+	return nil
+}
+
+// Read satisfies the io.Reader interface.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+
+	if err := r.fill(); err != nil {
+		return 0, err
+	}
+
+	n := copy(p, r.decoded[r.i:r.j])
+	r.i += n
+	return n, nil
+}
+
+// ReadByte satisfies the io.ByteReader interface.
+func (r *Reader) ReadByte() (byte, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+
+	if err := r.fill(); err != nil {
+		return 0, err
+	}
+
+	c := r.decoded[r.i]
+	r.i++
+	return c, nil
 }
diff --git a/vendor/github.com/golang/snappy/decode_amd64.go b/vendor/github.com/golang/snappy/decode_amd64.go
deleted file mode 100644
index fcd192b..0000000
--- a/vendor/github.com/golang/snappy/decode_amd64.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-package snappy
-
-// decode has the same semantics as in decode_other.go.
-//
-//go:noescape
-func decode(dst, src []byte) int
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s b/vendor/github.com/golang/snappy/decode_arm64.s
similarity index 67%
rename from vendor/github.com/klauspost/compress/snappy/decode_amd64.s
rename to vendor/github.com/golang/snappy/decode_arm64.s
index 1c66e37..7a3ead1 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
+++ b/vendor/github.com/golang/snappy/decode_arm64.s
@@ -1,4 +1,4 @@
-// Copyright 2016 The Go Authors. All rights reserved.
+// Copyright 2020 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -15,12 +15,12 @@
 //
 // All local variables fit into registers. The non-zero stack size is only to
 // spill registers and push args when issuing a CALL. The register allocation:
-//	- AX	scratch
-//	- BX	scratch
-//	- CX	length or x
-//	- DX	offset
-//	- SI	&src[s]
-//	- DI	&dst[d]
+//	- R2	scratch
+//	- R3	scratch
+//	- R4	length or x
+//	- R5	offset
+//	- R6	&src[s]
+//	- R7	&dst[d]
 //	+ R8	dst_base
 //	+ R9	dst_len
 //	+ R10	dst_base + dst_len
@@ -33,34 +33,35 @@
 // The registers R8-R13 (marked with a "+") are set at the start of the
 // function, and after a CALL returns, and are not otherwise modified.
 //
-// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
-// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
-TEXT ·decode(SB), NOSPLIT, $48-56
-	// Initialize SI, DI and R8-R13.
-	MOVQ dst_base+0(FP), R8
-	MOVQ dst_len+8(FP), R9
-	MOVQ R8, DI
-	MOVQ R8, R10
-	ADDQ R9, R10
-	MOVQ src_base+24(FP), R11
-	MOVQ src_len+32(FP), R12
-	MOVQ R11, SI
-	MOVQ R11, R13
-	ADDQ R12, R13
+// The d variable is implicitly R7 - R8,  and len(dst)-d is R10 - R7.
+// The s variable is implicitly R6 - R11, and len(src)-s is R13 - R6.
+TEXT ·decode(SB), NOSPLIT, $56-56
+	// Initialize R6, R7 and R8-R13.
+	MOVD dst_base+0(FP), R8
+	MOVD dst_len+8(FP), R9
+	MOVD R8, R7
+	MOVD R8, R10
+	ADD  R9, R10, R10
+	MOVD src_base+24(FP), R11
+	MOVD src_len+32(FP), R12
+	MOVD R11, R6
+	MOVD R11, R13
+	ADD  R12, R13, R13
 
 loop:
 	// for s < len(src)
-	CMPQ SI, R13
-	JEQ  end
+	CMP R13, R6
+	BEQ end
 
-	// CX = uint32(src[s])
+	// R4 = uint32(src[s])
 	//
 	// switch src[s] & 0x03
-	MOVBLZX (SI), CX
-	MOVL    CX, BX
-	ANDL    $3, BX
-	CMPL    BX, $1
-	JAE     tagCopy
+	MOVBU (R6), R4
+	MOVW  R4, R3
+	ANDW  $3, R3
+	MOVW  $1, R1
+	CMPW  R1, R3
+	BGE   tagCopy
 
 	// ----------------------------------------
 	// The code below handles literal tags.
@@ -68,35 +69,36 @@ loop:
 	// case tagLiteral:
 	// x := uint32(src[s] >> 2)
 	// switch
-	SHRL $2, CX
-	CMPL CX, $60
-	JAE  tagLit60Plus
+	MOVW $60, R1
+	LSRW $2, R4, R4
+	CMPW R4, R1
+	BLS  tagLit60Plus
 
 	// case x < 60:
 	// s++
-	INCQ SI
+	ADD $1, R6, R6
 
 doLit:
 	// This is the end of the inner "switch", when we have a literal tag.
 	//
-	// We assume that CX == x and x fits in a uint32, where x is the variable
+	// We assume that R4 == x and x fits in a uint32, where x is the variable
 	// used in the pure Go decode_other.go code.
 
 	// length = int(x) + 1
 	//
 	// Unlike the pure Go code, we don't need to check if length <= 0 because
-	// CX can hold 64 bits, so the increment cannot overflow.
-	INCQ CX
+	// R4 can hold 64 bits, so the increment cannot overflow.
+	ADD $1, R4, R4
 
 	// Prepare to check if copying length bytes will run past the end of dst or
 	// src.
 	//
-	// AX = len(dst) - d
-	// BX = len(src) - s
-	MOVQ R10, AX
-	SUBQ DI, AX
-	MOVQ R13, BX
-	SUBQ SI, BX
+	// R2 = len(dst) - d
+	// R3 = len(src) - s
+	MOVD R10, R2
+	SUB  R7, R2, R2
+	MOVD R13, R3
+	SUB  R6, R3, R3
 
 	// !!! Try a faster technique for short (16 or fewer bytes) copies.
 	//
@@ -109,12 +111,12 @@ doLit:
 	// is contiguous in memory and so it needs to leave enough source bytes to
 	// read the next tag without refilling buffers, but Go's Decode assumes
 	// contiguousness (the src argument is a []byte).
-	CMPQ CX, $16
-	JGT  callMemmove
-	CMPQ AX, $16
-	JLT  callMemmove
-	CMPQ BX, $16
-	JLT  callMemmove
+	CMP $16, R4
+	BGT callMemmove
+	CMP $16, R2
+	BLT callMemmove
+	CMP $16, R3
+	BLT callMemmove
 
 	// !!! Implement the copy from src to dst as a 16-byte load and store.
 	// (Decode's documentation says that dst and src must not overlap.)
@@ -124,57 +126,57 @@ doLit:
 	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
 	// non-nil error), so the overrun will be ignored.
 	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
 	// 16-byte loads and stores. This technique probably wouldn't be as
 	// effective on architectures that are fussier about alignment.
-	MOVOU 0(SI), X0
-	MOVOU X0, 0(DI)
+	LDP 0(R6), (R14, R15)
+	STP (R14, R15), 0(R7)
 
 	// d += length
 	// s += length
-	ADDQ CX, DI
-	ADDQ CX, SI
-	JMP  loop
+	ADD R4, R7, R7
+	ADD R4, R6, R6
+	B   loop
 
 callMemmove:
 	// if length > len(dst)-d || length > len(src)-s { etc }
-	CMPQ CX, AX
-	JGT  errCorrupt
-	CMPQ CX, BX
-	JGT  errCorrupt
+	CMP R2, R4
+	BGT errCorrupt
+	CMP R3, R4
+	BGT errCorrupt
 
 	// copy(dst[d:], src[s:s+length])
 	//
 	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
-	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
+	// R7, R6 and R4 as arguments. Coincidentally, we also need to spill those
 	// three registers to the stack, to save local variables across the CALL.
-	MOVQ DI, 0(SP)
-	MOVQ SI, 8(SP)
-	MOVQ CX, 16(SP)
-	MOVQ DI, 24(SP)
-	MOVQ SI, 32(SP)
-	MOVQ CX, 40(SP)
+	MOVD R7, 8(RSP)
+	MOVD R6, 16(RSP)
+	MOVD R4, 24(RSP)
+	MOVD R7, 32(RSP)
+	MOVD R6, 40(RSP)
+	MOVD R4, 48(RSP)
 	CALL runtime·memmove(SB)
 
 	// Restore local variables: unspill registers from the stack and
 	// re-calculate R8-R13.
-	MOVQ 24(SP), DI
-	MOVQ 32(SP), SI
-	MOVQ 40(SP), CX
-	MOVQ dst_base+0(FP), R8
-	MOVQ dst_len+8(FP), R9
-	MOVQ R8, R10
-	ADDQ R9, R10
-	MOVQ src_base+24(FP), R11
-	MOVQ src_len+32(FP), R12
-	MOVQ R11, R13
-	ADDQ R12, R13
+	MOVD 32(RSP), R7
+	MOVD 40(RSP), R6
+	MOVD 48(RSP), R4
+	MOVD dst_base+0(FP), R8
+	MOVD dst_len+8(FP), R9
+	MOVD R8, R10
+	ADD  R9, R10, R10
+	MOVD src_base+24(FP), R11
+	MOVD src_len+32(FP), R12
+	MOVD R11, R13
+	ADD  R12, R13, R13
 
 	// d += length
 	// s += length
-	ADDQ CX, DI
-	ADDQ CX, SI
-	JMP  loop
+	ADD R4, R7, R7
+	ADD R4, R6, R6
+	B   loop
 
 tagLit60Plus:
 	// !!! This fragment does the
@@ -182,142 +184,150 @@ tagLit60Plus:
 	// s += x - 58; if uint(s) > uint(len(src)) { etc }
 	//
 	// checks. In the asm version, we code it once instead of once per switch case.
-	ADDQ CX, SI
-	SUBQ $58, SI
-	CMPQ SI, R13
-	JA   errCorrupt
+	ADD  R4, R6, R6
+	SUB  $58, R6, R6
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
 
 	// case x == 60:
-	CMPL CX, $61
-	JEQ  tagLit61
-	JA   tagLit62Plus
+	MOVW $61, R1
+	CMPW R1, R4
+	BEQ  tagLit61
+	BGT  tagLit62Plus
 
 	// x = uint32(src[s-1])
-	MOVBLZX -1(SI), CX
-	JMP     doLit
+	MOVBU -1(R6), R4
+	B     doLit
 
 tagLit61:
 	// case x == 61:
 	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
-	MOVWLZX -2(SI), CX
-	JMP     doLit
+	MOVHU -2(R6), R4
+	B     doLit
 
 tagLit62Plus:
-	CMPL CX, $62
-	JA   tagLit63
+	CMPW $62, R4
+	BHI  tagLit63
 
 	// case x == 62:
 	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
-	MOVWLZX -3(SI), CX
-	MOVBLZX -1(SI), BX
-	SHLL    $16, BX
-	ORL     BX, CX
-	JMP     doLit
+	MOVHU -3(R6), R4
+	MOVBU -1(R6), R3
+	ORR   R3<<16, R4
+	B     doLit
 
 tagLit63:
 	// case x == 63:
 	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
-	MOVL -4(SI), CX
-	JMP  doLit
+	MOVWU -4(R6), R4
+	B     doLit
 
-// The code above handles literal tags.
-// ----------------------------------------
-// The code below handles copy tags.
+	// The code above handles literal tags.
+	// ----------------------------------------
+	// The code below handles copy tags.
 
 tagCopy4:
 	// case tagCopy4:
 	// s += 5
-	ADDQ $5, SI
+	ADD $5, R6, R6
 
 	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
-	JA   errCorrupt
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
 
 	// length = 1 + int(src[s-5])>>2
-	SHRQ $2, CX
-	INCQ CX
+	MOVD $1, R1
+	ADD  R4>>2, R1, R4
 
 	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
-	MOVLQZX -4(SI), DX
-	JMP     doCopy
+	MOVWU -4(R6), R5
+	B     doCopy
 
 tagCopy2:
 	// case tagCopy2:
 	// s += 3
-	ADDQ $3, SI
+	ADD $3, R6, R6
 
 	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
-	JA   errCorrupt
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
 
 	// length = 1 + int(src[s-3])>>2
-	SHRQ $2, CX
-	INCQ CX
+	MOVD $1, R1
+	ADD  R4>>2, R1, R4
 
 	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
-	MOVWQZX -2(SI), DX
-	JMP     doCopy
+	MOVHU -2(R6), R5
+	B     doCopy
 
 tagCopy:
 	// We have a copy tag. We assume that:
-	//	- BX == src[s] & 0x03
-	//	- CX == src[s]
-	CMPQ BX, $2
-	JEQ  tagCopy2
-	JA   tagCopy4
+	//	- R3 == src[s] & 0x03
+	//	- R4 == src[s]
+	CMP $2, R3
+	BEQ tagCopy2
+	BGT tagCopy4
 
 	// case tagCopy1:
 	// s += 2
-	ADDQ $2, SI
+	ADD $2, R6, R6
 
 	// if uint(s) > uint(len(src)) { etc }
-	CMPQ SI, R13
-	JA   errCorrupt
+	MOVD R6, R3
+	SUB  R11, R3, R3
+	CMP  R12, R3
+	BGT  errCorrupt
 
 	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
-	MOVQ    CX, DX
-	ANDQ    $0xe0, DX
-	SHLQ    $3, DX
-	MOVBQZX -1(SI), BX
-	ORQ     BX, DX
+	MOVD  R4, R5
+	AND   $0xe0, R5
+	MOVBU -1(R6), R3
+	ORR   R5<<3, R3, R5
 
 	// length = 4 + int(src[s-2])>>2&0x7
-	SHRQ $2, CX
-	ANDQ $7, CX
-	ADDQ $4, CX
+	MOVD $7, R1
+	AND  R4>>2, R1, R4
+	ADD  $4, R4, R4
 
 doCopy:
 	// This is the end of the outer "switch", when we have a copy tag.
 	//
 	// We assume that:
-	//	- CX == length && CX > 0
-	//	- DX == offset
+	//	- R4 == length && R4 > 0
+	//	- R5 == offset
 
 	// if offset <= 0 { etc }
-	CMPQ DX, $0
-	JLE  errCorrupt
+	MOVD $0, R1
+	CMP  R1, R5
+	BLE  errCorrupt
 
 	// if d < offset { etc }
-	MOVQ DI, BX
-	SUBQ R8, BX
-	CMPQ BX, DX
-	JLT  errCorrupt
+	MOVD R7, R3
+	SUB  R8, R3, R3
+	CMP  R5, R3
+	BLT  errCorrupt
 
 	// if length > len(dst)-d { etc }
-	MOVQ R10, BX
-	SUBQ DI, BX
-	CMPQ CX, BX
-	JGT  errCorrupt
+	MOVD R10, R3
+	SUB  R7, R3, R3
+	CMP  R3, R4
+	BGT  errCorrupt
 
 	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
 	//
 	// Set:
 	//	- R14 = len(dst)-d
 	//	- R15 = &dst[d-offset]
-	MOVQ R10, R14
-	SUBQ DI, R14
-	MOVQ DI, R15
-	SUBQ DX, R15
+	MOVD R10, R14
+	SUB  R7, R14, R14
+	MOVD R7, R15
+	SUB  R5, R15, R15
 
 	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
 	//
@@ -332,18 +342,18 @@ doCopy:
 	// }
 	// copy 16 bytes
 	// d += length
-	CMPQ CX, $16
-	JGT  slowForwardCopy
-	CMPQ DX, $8
-	JLT  slowForwardCopy
-	CMPQ R14, $16
-	JLT  slowForwardCopy
-	MOVQ 0(R15), AX
-	MOVQ AX, 0(DI)
-	MOVQ 8(R15), BX
-	MOVQ BX, 8(DI)
-	ADDQ CX, DI
-	JMP  loop
+	CMP  $16, R4
+	BGT  slowForwardCopy
+	CMP  $8, R5
+	BLT  slowForwardCopy
+	CMP  $16, R14
+	BLT  slowForwardCopy
+	MOVD 0(R15), R2
+	MOVD R2, 0(R7)
+	MOVD 8(R15), R3
+	MOVD R3, 8(R7)
+	ADD  R4, R7, R7
+	B    loop
 
 slowForwardCopy:
 	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
@@ -394,9 +404,9 @@ slowForwardCopy:
 	// if length > len(dst)-d-10 {
 	//   goto verySlowForwardCopy
 	// }
-	SUBQ $10, R14
-	CMPQ CX, R14
-	JGT  verySlowForwardCopy
+	SUB $10, R14, R14
+	CMP R14, R4
+	BGT verySlowForwardCopy
 
 makeOffsetAtLeast8:
 	// !!! As above, expand the pattern so that offset >= 8 and we can use
@@ -410,36 +420,37 @@ makeOffsetAtLeast8:
 	//   // The two previous lines together means that d-offset, and therefore
 	//   // R15, is unchanged.
 	// }
-	CMPQ DX, $8
-	JGE  fixUpSlowForwardCopy
-	MOVQ (R15), BX
-	MOVQ BX, (DI)
-	SUBQ DX, CX
-	ADDQ DX, DI
-	ADDQ DX, DX
-	JMP  makeOffsetAtLeast8
+	CMP  $8, R5
+	BGE  fixUpSlowForwardCopy
+	MOVD (R15), R3
+	MOVD R3, (R7)
+	SUB  R5, R4, R4
+	ADD  R5, R7, R7
+	ADD  R5, R5, R5
+	B    makeOffsetAtLeast8
 
 fixUpSlowForwardCopy:
-	// !!! Add length (which might be negative now) to d (implied by DI being
+	// !!! Add length (which might be negative now) to d (implied by R7 being
 	// &dst[d]) so that d ends up at the right place when we jump back to the
-	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// top of the loop. Before we do that, though, we save R7 to R2 so that, if
 	// length is positive, copying the remaining length bytes will write to the
 	// right place.
-	MOVQ DI, AX
-	ADDQ CX, DI
+	MOVD R7, R2
+	ADD  R4, R7, R7
 
 finishSlowForwardCopy:
 	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
 	// length means that we overrun, but as above, that will be fixed up by
 	// subsequent iterations of the outermost loop.
-	CMPQ CX, $0
-	JLE  loop
-	MOVQ (R15), BX
-	MOVQ BX, (AX)
-	ADDQ $8, R15
-	ADDQ $8, AX
-	SUBQ $8, CX
-	JMP  finishSlowForwardCopy
+	MOVD $0, R1
+	CMP  R1, R4
+	BLE  loop
+	MOVD (R15), R3
+	MOVD R3, (R2)
+	ADD  $8, R15, R15
+	ADD  $8, R2, R2
+	SUB  $8, R4, R4
+	B    finishSlowForwardCopy
 
 verySlowForwardCopy:
 	// verySlowForwardCopy is a simple implementation of forward copy. In C
@@ -454,29 +465,30 @@ verySlowForwardCopy:
 	//     break
 	//   }
 	// }
-	MOVB (R15), BX
-	MOVB BX, (DI)
-	INCQ R15
-	INCQ DI
-	DECQ CX
-	JNZ  verySlowForwardCopy
-	JMP  loop
-
-// The code above handles copy tags.
-// ----------------------------------------
+	MOVB (R15), R3
+	MOVB R3, (R7)
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	SUB  $1, R4, R4
+	CBNZ R4, verySlowForwardCopy
+	B    loop
+
+	// The code above handles copy tags.
+	// ----------------------------------------
 
 end:
 	// This is the end of the "for s < len(src)".
 	//
 	// if d != len(dst) { etc }
-	CMPQ DI, R10
-	JNE  errCorrupt
+	CMP R10, R7
+	BNE errCorrupt
 
 	// return 0
-	MOVQ $0, ret+48(FP)
+	MOVD $0, ret+48(FP)
 	RET
 
 errCorrupt:
 	// return decodeErrCodeCorrupt
-	MOVQ $1, ret+48(FP)
+	MOVD $1, R2
+	MOVD R2, ret+48(FP)
 	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go b/vendor/github.com/golang/snappy/decode_asm.go
similarity index 93%
rename from vendor/github.com/klauspost/compress/snappy/decode_amd64.go
rename to vendor/github.com/golang/snappy/decode_asm.go
index fcd192b..7082b34 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
+++ b/vendor/github.com/golang/snappy/decode_asm.go
@@ -5,6 +5,7 @@
 // +build !appengine
 // +build gc
 // +build !noasm
+// +build amd64 arm64
 
 package snappy
 
diff --git a/vendor/github.com/golang/snappy/decode_other.go b/vendor/github.com/golang/snappy/decode_other.go
index 8c9f204..2f672be 100644
--- a/vendor/github.com/golang/snappy/decode_other.go
+++ b/vendor/github.com/golang/snappy/decode_other.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine !gc noasm
+// +build !amd64,!arm64 appengine !gc noasm
 
 package snappy
 
@@ -85,14 +85,28 @@ func decode(dst, src []byte) int {
 		if offset <= 0 || d < offset || length > len(dst)-d {
 			return decodeErrCodeCorrupt
 		}
-		// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
-		// the built-in copy function, this byte-by-byte copy always runs
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If no overlap, use the built-in copy:
+		if offset >= length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
 		// forwards, even if the slices overlap. Conceptually, this is:
 		//
 		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
-		for end := d + length; d != end; d++ {
-			dst[d] = dst[d-offset]
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
 		}
+		d += length
 	}
 	if d != len(dst) {
 		return decodeErrCodeCorrupt
diff --git a/vendor/github.com/golang/snappy/encode.go b/vendor/github.com/golang/snappy/encode.go
index 8d393e9..7f23657 100644
--- a/vendor/github.com/golang/snappy/encode.go
+++ b/vendor/github.com/golang/snappy/encode.go
@@ -15,6 +15,8 @@ import (
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Encode handles the Snappy block format, not the Snappy stream format.
 func Encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
@@ -139,6 +141,8 @@ func NewBufferedWriter(w io.Writer) *Writer {
 }
 
 // Writer is an io.Writer that can write Snappy-compressed bytes.
+//
+// Writer handles the Snappy stream format, not the Snappy block format.
 type Writer struct {
 	w   io.Writer
 	err error
diff --git a/vendor/github.com/golang/snappy/encode_arm64.s b/vendor/github.com/golang/snappy/encode_arm64.s
new file mode 100644
index 0000000..f8d54ad
--- /dev/null
+++ b/vendor/github.com/golang/snappy/encode_arm64.s
@@ -0,0 +1,722 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R3	len(lit)
+//	- R4	n
+//	- R6	return value
+//	- R8	&dst[i]
+//	- R10	&lit[0]
+//
+// The 32 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $32-56
+	MOVD dst_base+0(FP), R8
+	MOVD lit_base+24(FP), R10
+	MOVD lit_len+32(FP), R3
+	MOVD R3, R6
+	MOVW R3, R4
+	SUBW $1, R4, R4
+
+	CMPW $60, R4
+	BLT  oneByte
+	CMPW $256, R4
+	BLT  twoBytes
+
+threeBytes:
+	MOVD $0xf4, R2
+	MOVB R2, 0(R8)
+	MOVW R4, 1(R8)
+	ADD  $3, R8, R8
+	ADD  $3, R6, R6
+	B    memmove
+
+twoBytes:
+	MOVD $0xf0, R2
+	MOVB R2, 0(R8)
+	MOVB R4, 1(R8)
+	ADD  $2, R8, R8
+	ADD  $2, R6, R6
+	B    memmove
+
+oneByte:
+	LSLW $2, R4, R4
+	MOVB R4, 0(R8)
+	ADD  $1, R8, R8
+	ADD  $1, R6, R6
+
+memmove:
+	MOVD R6, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// R8, R10 and R3 as arguments.
+	MOVD R8, 8(RSP)
+	MOVD R10, 16(RSP)
+	MOVD R3, 24(RSP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R3	length
+//	- R7	&dst[0]
+//	- R8	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVD dst_base+0(FP), R8
+	MOVD R8, R7
+	MOVD offset+24(FP), R11
+	MOVD length+32(FP), R3
+
+loop0:
+	// for length >= 68 { etc }
+	CMPW $68, R3
+	BLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVD $0xfe, R2
+	MOVB R2, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUB  $64, R3, R3
+	B    loop0
+
+step1:
+	// if length > 64 { etc }
+	CMP $64, R3
+	BLE step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVD $0xee, R2
+	MOVB R2, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUB  $60, R3, R3
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMP  $12, R3
+	BGE  step3
+	CMPW $2048, R11
+	BGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(R8)
+	LSRW $3, R11, R11
+	AND  $0xe0, R11, R11
+	SUB  $4, R3, R3
+	LSLW $2, R3
+	AND  $0xff, R3, R3
+	ORRW R3, R11, R11
+	ORRW $1, R11, R11
+	MOVB R11, 0(R8)
+	ADD  $2, R8, R8
+
+	// Return the number of bytes written.
+	SUB  R7, R8, R8
+	MOVD R8, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUB  $1, R3, R3
+	AND  $0xff, R3, R3
+	LSLW $2, R3, R3
+	ORRW $2, R3, R3
+	MOVB R3, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+
+	// Return the number of bytes written.
+	SUB  R7, R8, R8
+	MOVD R8, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- R6	&src[0]
+//	- R7	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVD src_base+0(FP), R6
+	MOVD src_len+8(FP), R14
+	MOVD i+24(FP), R15
+	MOVD j+32(FP), R7
+	ADD  R6, R14, R14
+	ADD  R6, R15, R15
+	ADD  R6, R7, R7
+	MOVD R14, R13
+	SUB  $8, R13, R13
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMP  R13, R7
+	BHI  cmp1
+	MOVD (R15), R3
+	MOVD (R7), R4
+	CMP  R4, R3
+	BNE  bsf
+	ADD  $8, R15, R15
+	ADD  $8, R7, R7
+	B    cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs.
+	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
+	// combination of which finds the least significant bit which is set.
+	// The arm64 architecture is little-endian, and the shift by 3 converts
+	// a bit index to a byte index.
+	EOR  R3, R4, R4
+	RBIT R4, R4
+	CLZ  R4, R4
+	ADD  R4>>3, R7, R7
+
+	// Convert from &src[ret] to ret.
+	SUB  R6, R7, R7
+	MOVD R7, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMP  R7, R14
+	BLS  extendMatchEnd
+	MOVB (R15), R3
+	MOVB (R7), R4
+	CMP  R4, R3
+	BNE  extendMatchEnd
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	B    cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUB  R6, R7, R7
+	MOVD R7, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- R3	.	.
+//	- R4	.	.
+//	- R5	64	shift
+//	- R6	72	&src[0], tableSize
+//	- R7	80	&src[s]
+//	- R8	88	&dst[d]
+//	- R9	96	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	104	prevHash, currHash, nextHash, offset
+//	- R12	112	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	120	candidate
+//	- R16	.	hash constant, 0x1e35a7bd
+//	- R17	.	&table
+//	- .  	128	table
+//
+// The second column (64, 72, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 64 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 64 + 64 = 32896.
+TEXT ·encodeBlock(SB), 0, $32896-56
+	MOVD dst_base+0(FP), R8
+	MOVD src_base+24(FP), R7
+	MOVD src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVD  $24, R5
+	MOVD  $256, R6
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	MOVD $16384, R2
+	CMP  R2, R6
+	BGE  varTable
+	CMP  R14, R6
+	BGE  varTable
+	SUB  $1, R5, R5
+	LSL  $1, R6, R6
+	B    calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each
+	// iterations writes 64 bytes, so we can do only tableSize/32 writes
+	// instead of the 2048 writes that would zero-initialize all of table's
+	// 32768 bytes. This clear could overrun the first tableSize elements, but
+	// it won't overrun the allocated stack size.
+	ADD  $128, RSP, R17
+	MOVD R17, R4
+
+	// !!! R6 = &src[tableSize]
+	ADD R6<<1, R17, R6
+
+memclr:
+	STP.P (ZR, ZR), 64(R4)
+	STP   (ZR, ZR), -48(R4)
+	STP   (ZR, ZR), -32(R4)
+	STP   (ZR, ZR), -16(R4)
+	CMP   R4, R6
+	BHI   memclr
+
+	// !!! R6 = &src[0]
+	MOVD R7, R6
+
+	// sLimit := len(src) - inputMargin
+	MOVD R14, R9
+	SUB  $15, R9, R9
+
+	// !!! Pre-emptively spill R5, R6 and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVD R5, 64(RSP)
+	MOVD R6, 72(RSP)
+	MOVD R9, 96(RSP)
+
+	// nextEmit := 0
+	MOVD R6, R10
+
+	// s := 1
+	ADD $1, R7, R7
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVW 0(R7), R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVD $32, R12
+
+	// nextS := s
+	MOVD R7, R13
+
+	// candidate := 0
+	MOVD $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVD R13, R7
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVD R12, R14
+	LSR  $5, R14, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADD R14, R13, R13
+
+	// skip += bytesBetweenHashLookups
+	ADD R14, R12, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVD R13, R3
+	SUB  R6, R3, R3
+	CMP  R9, R3
+	BHI  emitRemainder
+
+	// candidate = int(table[nextHash])
+	MOVHU 0(R17)(R11<<1), R15
+
+	// table[nextHash] = uint16(s)
+	MOVD R7, R3
+	SUB  R6, R3, R3
+
+	MOVH R3, 0(R17)(R11<<1)
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVW 0(R13), R11
+	MULW R16, R11
+	LSRW R5, R11, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVW 0(R7), R3
+	MOVW (R6)(R15), R4
+	CMPW R4, R3
+	BNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVD R7, R3
+	SUB  R10, R3, R3
+	CMP  $16, R3
+	BLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVW R3, R4
+	SUBW $1, R4, R4
+
+	MOVW $60, R2
+	CMPW R2, R4
+	BLT  inlineEmitLiteralOneByte
+	MOVW $256, R2
+	CMPW R2, R4
+	BLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVD $0xf4, R1
+	MOVB R1, 0(R8)
+	MOVW R4, 1(R8)
+	ADD  $3, R8, R8
+	B    inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVD $0xf0, R1
+	MOVB R1, 0(R8)
+	MOVB R4, 1(R8)
+	ADD  $2, R8, R8
+	B    inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	LSLW $2, R4, R4
+	MOVB R4, 0(R8)
+	ADD  $1, R8, R8
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// R8, R10 and R3 as arguments.
+	MOVD R8, 8(RSP)
+	MOVD R10, 16(RSP)
+	MOVD R3, 24(RSP)
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADD   R3, R8, R8
+	MOVD  R7, 80(RSP)
+	MOVD  R8, 88(RSP)
+	MOVD  R15, 120(RSP)
+	CALL  runtime·memmove(SB)
+	MOVD  64(RSP), R5
+	MOVD  72(RSP), R6
+	MOVD  80(RSP), R7
+	MOVD  88(RSP), R8
+	MOVD  96(RSP), R9
+	MOVD  120(RSP), R15
+	ADD   $128, RSP, R17
+	MOVW  $0xa7bd, R16
+	MOVKW $(0x1e35<<16), R16
+	B     inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB R3, R4
+	SUBW $1, R4, R4
+	AND  $0xff, R4, R4
+	LSLW $2, R4, R4
+	MOVB R4, (R8)
+	ADD  $1, R8, R8
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on arm64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	LDP 0(R10), (R0, R1)
+	STP (R0, R1), 0(R8)
+	ADD R3, R8, R8
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVD R7, R12
+
+	// !!! offset := base - candidate
+	MOVD R12, R11
+	SUB  R15, R11, R11
+	SUB  R6, R11, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVD src_len+32(FP), R14
+	ADD  R6, R14, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVD R14, R13
+	SUB  $8, R13, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADD $4, R15, R15
+	ADD R6, R15, R15
+
+	// !!! s += 4
+	ADD $4, R7, R7
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMP  R13, R7
+	BHI  inlineExtendMatchCmp1
+	MOVD (R15), R3
+	MOVD (R7), R4
+	CMP  R4, R3
+	BNE  inlineExtendMatchBSF
+	ADD  $8, R15, R15
+	ADD  $8, R7, R7
+	B    inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs.
+	// RBIT reverses the bit order, then CLZ counts the leading zeros, the
+	// combination of which finds the least significant bit which is set.
+	// The arm64 architecture is little-endian, and the shift by 3 converts
+	// a bit index to a byte index.
+	EOR  R3, R4, R4
+	RBIT R4, R4
+	CLZ  R4, R4
+	ADD  R4>>3, R7, R7
+	B    inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMP  R7, R14
+	BLS  inlineExtendMatchEnd
+	MOVB (R15), R3
+	MOVB (R7), R4
+	CMP  R4, R3
+	BNE  inlineExtendMatchEnd
+	ADD  $1, R15, R15
+	ADD  $1, R7, R7
+	B    inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVD R7, R3
+	SUB  R12, R3, R3
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	MOVW $68, R2
+	CMPW R2, R3
+	BLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVD $0xfe, R1
+	MOVB R1, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUBW $64, R3, R3
+	B    inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	MOVW $64, R2
+	CMPW R2, R3
+	BLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVD $0xee, R1
+	MOVB R1, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+	SUBW $60, R3, R3
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	MOVW $12, R2
+	CMPW R2, R3
+	BGE  inlineEmitCopyStep3
+	MOVW $2048, R2
+	CMPW R2, R11
+	BGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(R8)
+	LSRW $8, R11, R11
+	LSLW $5, R11, R11
+	SUBW $4, R3, R3
+	AND  $0xff, R3, R3
+	LSLW $2, R3, R3
+	ORRW R3, R11, R11
+	ORRW $1, R11, R11
+	MOVB R11, 0(R8)
+	ADD  $2, R8, R8
+	B    inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBW $1, R3, R3
+	LSLW $2, R3, R3
+	ORRW $2, R3, R3
+	MOVB R3, 0(R8)
+	MOVW R11, 1(R8)
+	ADD  $3, R8, R8
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVD R7, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVD R7, R3
+	SUB  R6, R3, R3
+	CMP  R3, R9
+	BLS  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVD -1(R7), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVD R7, R3
+	SUB  R6, R3, R3
+	SUB  $1, R3, R3
+
+	MOVHU R3, 0(R17)(R11<<1)
+
+	// currHash := hash(uint32(x>>8), shift)
+	LSR  $8, R14, R14
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// candidate = int(table[currHash])
+	MOVHU 0(R17)(R11<<1), R15
+
+	// table[currHash] = uint16(s)
+	ADD   $1, R3, R3
+	MOVHU R3, 0(R17)(R11<<1)
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVW (R6)(R15), R4
+	CMPW R4, R14
+	BEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	LSR  $8, R14, R14
+	MOVW R14, R11
+	MULW R16, R11, R11
+	LSRW R5, R11, R11
+
+	// s++
+	ADD $1, R7, R7
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	B outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVD src_len+32(FP), R3
+	ADD  R6, R3, R3
+	CMP  R3, R10
+	BEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVD R8, 8(RSP)
+	MOVD $0, 16(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVD $0, 24(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVD R10, 32(RSP)
+	SUB  R10, R3, R3
+	MOVD R3, 40(RSP)
+	MOVD R3, 48(RSP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVD R8, 88(RSP)
+	CALL ·emitLiteral(SB)
+	MOVD 88(RSP), R8
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	MOVD 56(RSP), R1
+	ADD  R1, R8, R8
+
+encodeBlockEnd:
+	MOVD dst_base+0(FP), R3
+	SUB  R3, R8, R8
+	MOVD R8, d+48(FP)
+	RET
diff --git a/vendor/github.com/golang/snappy/encode_amd64.go b/vendor/github.com/golang/snappy/encode_asm.go
similarity index 97%
rename from vendor/github.com/golang/snappy/encode_amd64.go
rename to vendor/github.com/golang/snappy/encode_asm.go
index 150d91b..107c1e7 100644
--- a/vendor/github.com/golang/snappy/encode_amd64.go
+++ b/vendor/github.com/golang/snappy/encode_asm.go
@@ -5,6 +5,7 @@
 // +build !appengine
 // +build gc
 // +build !noasm
+// +build amd64 arm64
 
 package snappy
 
diff --git a/vendor/github.com/golang/snappy/encode_other.go b/vendor/github.com/golang/snappy/encode_other.go
index dbcae90..296d7f0 100644
--- a/vendor/github.com/golang/snappy/encode_other.go
+++ b/vendor/github.com/golang/snappy/encode_other.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine !gc noasm
+// +build !amd64,!arm64 appengine !gc noasm
 
 package snappy
 
diff --git a/vendor/github.com/klauspost/compress/.gitattributes b/vendor/github.com/klauspost/compress/.gitattributes
new file mode 100644
index 0000000..4024335
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/.gitattributes
@@ -0,0 +1,2 @@
+* -text
+*.bin -text -diff
diff --git a/vendor/github.com/klauspost/compress/.gitignore b/vendor/github.com/klauspost/compress/.gitignore
new file mode 100644
index 0000000..d31b378
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/.gitignore
@@ -0,0 +1,32 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
+*.prof
+/s2/cmd/_s2sx/sfx-exe
+
+# Linux perf files
+perf.data
+perf.data.old
+
+# gdb history
+.gdb_history
diff --git a/vendor/github.com/klauspost/compress/.goreleaser.yml b/vendor/github.com/klauspost/compress/.goreleaser.yml
new file mode 100644
index 0000000..0af08e6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/.goreleaser.yml
@@ -0,0 +1,141 @@
+# This is an example goreleaser.yaml file with some sane defaults.
+# Make sure to check the documentation at http://goreleaser.com
+before:
+  hooks:
+    - ./gen.sh
+    - go install mvdan.cc/garble@latest
+
+builds:
+  -
+    id: "s2c"
+    binary: s2c
+    main: ./s2/cmd/s2c/main.go
+    flags:
+      - -trimpath
+    env:
+      - CGO_ENABLED=0
+    goos:
+      - aix
+      - linux
+      - freebsd
+      - netbsd
+      - windows
+      - darwin
+    goarch:
+      - 386
+      - amd64
+      - arm
+      - arm64
+      - ppc64
+      - ppc64le
+      - mips64
+      - mips64le
+    goarm:
+      - 7
+    gobinary: garble
+  -
+    id: "s2d"
+    binary: s2d
+    main: ./s2/cmd/s2d/main.go
+    flags:
+      - -trimpath
+    env:
+      - CGO_ENABLED=0
+    goos:
+      - aix
+      - linux
+      - freebsd
+      - netbsd
+      - windows
+      - darwin
+    goarch:
+      - 386
+      - amd64
+      - arm
+      - arm64
+      - ppc64
+      - ppc64le
+      - mips64
+      - mips64le
+    goarm:
+      - 7
+    gobinary: garble
+  -
+    id: "s2sx"
+    binary: s2sx
+    main: ./s2/cmd/_s2sx/main.go
+    flags:
+      - -modfile=s2sx.mod
+      - -trimpath
+    env:
+      - CGO_ENABLED=0
+    goos:
+      - aix
+      - linux
+      - freebsd
+      - netbsd
+      - windows
+      - darwin
+    goarch:
+      - 386
+      - amd64
+      - arm
+      - arm64
+      - ppc64
+      - ppc64le
+      - mips64
+      - mips64le
+    goarm:
+      - 7
+    gobinary: garble
+
+archives:
+  -
+    id: s2-binaries
+    name_template: "s2-{{ .Os }}_{{ .Arch }}_{{ .Version }}"
+    replacements:
+      aix: AIX
+      darwin: OSX
+      linux: Linux
+      windows: Windows
+      386: i386
+      amd64: x86_64
+      freebsd: FreeBSD
+      netbsd: NetBSD
+    format_overrides:
+      - goos: windows
+        format: zip
+    files:
+      - unpack/*
+      - s2/LICENSE
+      - s2/README.md
+checksum:
+  name_template: 'checksums.txt'
+snapshot:
+  name_template: "{{ .Tag }}-next"
+changelog:
+  sort: asc
+  filters:
+    exclude:
+    - '^doc:'
+    - '^docs:'
+    - '^test:'
+    - '^tests:'
+    - '^Update\sREADME.md'
+
+nfpms:
+  -
+    file_name_template: "s2_package_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
+    vendor: Klaus Post
+    homepage: https://github.com/klauspost/compress
+    maintainer: Klaus Post <klauspost@gmail.com>
+    description: S2 Compression Tool
+    license: BSD 3-Clause
+    formats:
+      - deb
+      - rpm
+    replacements:
+      darwin: Darwin
+      linux: Linux
+      freebsd: FreeBSD
+      amd64: x86_64
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE
index 1eb75ef..87d5574 100644
--- a/vendor/github.com/klauspost/compress/LICENSE
+++ b/vendor/github.com/klauspost/compress/LICENSE
@@ -26,3 +26,279 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+------------------
+
+Files: gzhttp/*
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2016-2017 The New York Times Company
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+------------------
+
+Files: s2/cmd/internal/readahead/*
+
+The MIT License (MIT)
+
+Copyright (c) 2015 Klaus Post
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+---------------------
+Files: snappy/*
+Files: internal/snapref/*
+
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+-----------------
+
+Files: s2/cmd/internal/filepathx/*
+
+Copyright 2016 The filepathx Authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
new file mode 100644
index 0000000..c3ec9d8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -0,0 +1,506 @@
+# compress
+
+This package provides various compression algorithms.
+
+* [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression in pure Go.
+* [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) is a high performance replacement for Snappy.
+* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
+* [snappy](https://github.com/klauspost/compress/tree/master/snappy) is a drop-in replacement for `github.com/golang/snappy` offering better compression and concurrent streams.
+* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
+* [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.
+* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
+* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.
+
+[![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
+[![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)
+[![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)
+
+# changelog
+
+* May 5, 2022 (v1.15.3)
+	* zstd: Allow to ignore checksum checking by @WojciechMula [#572](https://github.com/klauspost/compress/pull/572)
+	* s2: Fix incorrect seek for io.SeekEnd in [#575](https://github.com/klauspost/compress/pull/575)
+
+* Apr 26, 2022 (v1.15.2)
+	* zstd: Add x86-64 assembly for decompression on streams and blocks. Contributed by [@WojciechMula](https://github.com/WojciechMula). Typically 2x faster.  [#528](https://github.com/klauspost/compress/pull/528) [#531](https://github.com/klauspost/compress/pull/531) [#545](https://github.com/klauspost/compress/pull/545) [#537](https://github.com/klauspost/compress/pull/537)
+	* zstd: Add options to ZipDecompressor and fixes [#539](https://github.com/klauspost/compress/pull/539)
+	* s2: Use sorted search for index [#555](https://github.com/klauspost/compress/pull/555)
+	* Minimum version is Go 1.16, added CI test on 1.18.
+
+* Mar 11, 2022 (v1.15.1)
+	* huff0: Add x86 assembly of Decode4X by @WojciechMula in [#512](https://github.com/klauspost/compress/pull/512)
+	* zstd: Reuse zip decoders in [#514](https://github.com/klauspost/compress/pull/514)
+	* zstd: Detect extra block data and report as corrupted in [#520](https://github.com/klauspost/compress/pull/520)
+	* zstd: Handle zero sized frame content size stricter in [#521](https://github.com/klauspost/compress/pull/521)
+	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)
+
+* Mar 3, 2022 (v1.15.0)
+	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)
+	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)
+	* huff0: Prevent single blocks exceeding 16 bits by @klauspost in[#507](https://github.com/klauspost/compress/pull/507)
+	* flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509)
+	* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)
+	* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)
+
+<details>
+	<summary>See  Details</summary>
+Both compression and decompression now supports "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.
+
+Stream decompression is now faster on asynchronous, since the goroutine allocation much more effectively splits the workload. On typical streams this will typically use 2 cores fully for decompression. When a stream has finished decoding no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.
+
+While the release has been extensively tested, it is recommended to testing when upgrading.
+</details>
+
+* Feb 22, 2022 (v1.14.4)
+	* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)
+	* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)
+	* zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501)  #501
+	* huff0: Use static decompression buffer up to 30% faster by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)
+
+* Feb 17, 2022 (v1.14.3)
+	* flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494)  [#478](https://github.com/klauspost/compress/pull/478)
+	* flate: Faster decompression speed, ~5-10%. [#483](https://github.com/klauspost/compress/pull/483)
+	* s2: Faster compression with Go v1.18 and amd64 microarch level 3+. [#484](https://github.com/klauspost/compress/pull/484) [#486](https://github.com/klauspost/compress/pull/486)
+
+* Jan 25, 2022 (v1.14.2)
+	* zstd: improve header decoder by @dsnet  [#476](https://github.com/klauspost/compress/pull/476)
+	* zstd: Add bigger default blocks  [#469](https://github.com/klauspost/compress/pull/469)
+	* zstd: Remove unused decompression buffer [#470](https://github.com/klauspost/compress/pull/470)
+	* zstd: Fix logically dead code by @ningmingxiao [#472](https://github.com/klauspost/compress/pull/472)
+	* flate: Improve level 7-9 [#471](https://github.com/klauspost/compress/pull/471) [#473](https://github.com/klauspost/compress/pull/473)
+	* zstd: Add noasm tag for xxhash [#475](https://github.com/klauspost/compress/pull/475)
+
+* Jan 11, 2022 (v1.14.1)
+	* s2: Add stream index in [#462](https://github.com/klauspost/compress/pull/462)
+	* flate: Speed and efficiency improvements in [#439](https://github.com/klauspost/compress/pull/439) [#461](https://github.com/klauspost/compress/pull/461) [#455](https://github.com/klauspost/compress/pull/455) [#452](https://github.com/klauspost/compress/pull/452) [#458](https://github.com/klauspost/compress/pull/458)
+	* zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)
+	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)
+	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)
+
+* Aug 30, 2021 (v1.13.5)
+	* gz/zlib/flate: Alias stdlib errors [#425](https://github.com/klauspost/compress/pull/425)
+	* s2: Add block support to commandline tools [#413](https://github.com/klauspost/compress/pull/413)
+	* zstd: pooledZipWriter should return Writers to the same pool [#426](https://github.com/klauspost/compress/pull/426)
+	* Removed golang/snappy as external dependency for tests [#421](https://github.com/klauspost/compress/pull/421)
+
+* Aug 12, 2021 (v1.13.4)
+	* Add [snappy replacement package](https://github.com/klauspost/compress/tree/master/snappy).
+	* zstd: Fix incorrect encoding in "best" mode [#415](https://github.com/klauspost/compress/pull/415)
+
+* Aug 3, 2021 (v1.13.3) 
+	* zstd: Improve Best compression [#404](https://github.com/klauspost/compress/pull/404)
+	* zstd: Fix WriteTo error forwarding [#411](https://github.com/klauspost/compress/pull/411)
+	* gzhttp: Return http.HandlerFunc instead of http.Handler. Unlikely breaking change. [#406](https://github.com/klauspost/compress/pull/406)
+	* s2sx: Fix max size error [#399](https://github.com/klauspost/compress/pull/399)
+	* zstd: Add optional stream content size on reset [#401](https://github.com/klauspost/compress/pull/401)
+	* zstd: use SpeedBestCompression for level >= 10 [#410](https://github.com/klauspost/compress/pull/410)
+
+* Jun 14, 2021 (v1.13.1)
+	* s2: Add full Snappy output support  [#396](https://github.com/klauspost/compress/pull/396)
+	* zstd: Add configurable [Decoder window](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithDecoderMaxWindow) size [#394](https://github.com/klauspost/compress/pull/394)
+	* gzhttp: Add header to skip compression  [#389](https://github.com/klauspost/compress/pull/389)
+	* s2: Improve speed with bigger output margin  [#395](https://github.com/klauspost/compress/pull/395)
+
+* Jun 3, 2021 (v1.13.0)
+	* Added [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp#gzip-handler) which allows wrapping HTTP servers and clients with GZIP compressors.
+	* zstd: Detect short invalid signatures [#382](https://github.com/klauspost/compress/pull/382)
+	* zstd: Spawn decoder goroutine only if needed. [#380](https://github.com/klauspost/compress/pull/380)
+
+<details>
+	<summary>See changes to v1.12.x</summary>
+	
+* May 25, 2021 (v1.12.3)
+	* deflate: Better/faster Huffman encoding [#374](https://github.com/klauspost/compress/pull/374)
+	* deflate: Allocate less for history. [#375](https://github.com/klauspost/compress/pull/375)
+	* zstd: Forward read errors [#373](https://github.com/klauspost/compress/pull/373) 
+
+* Apr 27, 2021 (v1.12.2)
+	* zstd: Improve better/best compression [#360](https://github.com/klauspost/compress/pull/360) [#364](https://github.com/klauspost/compress/pull/364) [#365](https://github.com/klauspost/compress/pull/365)
+	* zstd: Add helpers to compress/decompress zstd inside zip files [#363](https://github.com/klauspost/compress/pull/363)
+	* deflate: Improve level 5+6 compression [#367](https://github.com/klauspost/compress/pull/367)
+	* s2: Improve better/best compression [#358](https://github.com/klauspost/compress/pull/358) [#359](https://github.com/klauspost/compress/pull/358)
+	* s2: Load after checking src limit on amd64. [#362](https://github.com/klauspost/compress/pull/362)
+	* s2sx: Limit max executable size [#368](https://github.com/klauspost/compress/pull/368) 
+
+* Apr 14, 2021 (v1.12.1)
+	* snappy package removed. Upstream added as dependency.
+	* s2: Better compression in "best" mode [#353](https://github.com/klauspost/compress/pull/353)
+	* s2sx: Add stdin input and detect pre-compressed from signature [#352](https://github.com/klauspost/compress/pull/352)
+	* s2c/s2d: Add http as possible input [#348](https://github.com/klauspost/compress/pull/348)
+	* s2c/s2d/s2sx: Always truncate when writing files [#352](https://github.com/klauspost/compress/pull/352)
+	* zstd: Reduce memory usage further when using [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) [#346](https://github.com/klauspost/compress/pull/346)
+	* s2: Fix potential problem with amd64 assembly and profilers [#349](https://github.com/klauspost/compress/pull/349)
+</details>
+
+<details>
+	<summary>See changes to v1.11.x</summary>
+	
+* Mar 26, 2021 (v1.11.13)
+	* zstd: Big speedup on small dictionary encodes [#344](https://github.com/klauspost/compress/pull/344) [#345](https://github.com/klauspost/compress/pull/345)
+	* zstd: Add [WithLowerEncoderMem](https://pkg.go.dev/github.com/klauspost/compress/zstd#WithLowerEncoderMem) encoder option [#336](https://github.com/klauspost/compress/pull/336)
+	* deflate: Improve entropy compression [#338](https://github.com/klauspost/compress/pull/338)
+	* s2: Clean up and minor performance improvement in best [#341](https://github.com/klauspost/compress/pull/341)
+
+* Mar 5, 2021 (v1.11.12)
+	* s2: Add `s2sx` binary that creates [self extracting archives](https://github.com/klauspost/compress/tree/master/s2#s2sx-self-extracting-archives).
+	* s2: Speed up decompression on non-assembly platforms [#328](https://github.com/klauspost/compress/pull/328)
+
+* Mar 1, 2021 (v1.11.9)
+	* s2: Add ARM64 decompression assembly. Around 2x output speed. [#324](https://github.com/klauspost/compress/pull/324)
+	* s2: Improve "better" speed and efficiency. [#325](https://github.com/klauspost/compress/pull/325)
+	* s2: Fix binaries.
+
+* Feb 25, 2021 (v1.11.8)
+	* s2: Fixed occational out-of-bounds write on amd64. Upgrade recommended.
+	* s2: Add AMD64 assembly for better mode. 25-50% faster. [#315](https://github.com/klauspost/compress/pull/315)
+	* s2: Less upfront decoder allocation. [#322](https://github.com/klauspost/compress/pull/322)
+	* zstd: Faster "compression" of incompressible data. [#314](https://github.com/klauspost/compress/pull/314)
+	* zip: Fix zip64 headers. [#313](https://github.com/klauspost/compress/pull/313)
+  
+* Jan 14, 2021 (v1.11.7)
+	* Use Bytes() interface to get bytes across packages. [#309](https://github.com/klauspost/compress/pull/309)
+	* s2: Add 'best' compression option.  [#310](https://github.com/klauspost/compress/pull/310)
+	* s2: Add ReaderMaxBlockSize, changes `s2.NewReader` signature to include varargs. [#311](https://github.com/klauspost/compress/pull/311)
+	* s2: Fix crash on small better buffers. [#308](https://github.com/klauspost/compress/pull/308)
+	* s2: Clean up decoder. [#312](https://github.com/klauspost/compress/pull/312)
+
+* Jan 7, 2021 (v1.11.6)
+	* zstd: Make decoder allocations smaller [#306](https://github.com/klauspost/compress/pull/306)
+	* zstd: Free Decoder resources when Reset is called with a nil io.Reader  [#305](https://github.com/klauspost/compress/pull/305)
+
+* Dec 20, 2020 (v1.11.4)
+	* zstd: Add Best compression mode [#304](https://github.com/klauspost/compress/pull/304)
+	* Add header decoder [#299](https://github.com/klauspost/compress/pull/299)
+	* s2: Add uncompressed stream option [#297](https://github.com/klauspost/compress/pull/297)
+	* Simplify/speed up small blocks with known max size. [#300](https://github.com/klauspost/compress/pull/300)
+	* zstd: Always reset literal dict encoder [#303](https://github.com/klauspost/compress/pull/303)
+
+* Nov 15, 2020 (v1.11.3)
+	* inflate: 10-15% faster decompression  [#293](https://github.com/klauspost/compress/pull/293)
+	* zstd: Tweak DecodeAll default allocation [#295](https://github.com/klauspost/compress/pull/295)
+
+* Oct 11, 2020 (v1.11.2)
+	* s2: Fix out of bounds read in "better" block compression [#291](https://github.com/klauspost/compress/pull/291)
+
+* Oct 1, 2020 (v1.11.1)
+	* zstd: Set allLitEntropy true in default configuration [#286](https://github.com/klauspost/compress/pull/286)
+
+* Sept 8, 2020 (v1.11.0)
+	* zstd: Add experimental compression [dictionaries](https://github.com/klauspost/compress/tree/master/zstd#dictionaries) [#281](https://github.com/klauspost/compress/pull/281)
+	* zstd: Fix mixed Write and ReadFrom calls [#282](https://github.com/klauspost/compress/pull/282)
+	* inflate/gz: Limit variable shifts, ~5% faster decompression [#274](https://github.com/klauspost/compress/pull/274)
+</details>
+
+<details>
+	<summary>See changes to v1.10.x</summary>
+ 
+* July 8, 2020 (v1.10.11) 
+	* zstd: Fix extra block when compressing with ReadFrom. [#278](https://github.com/klauspost/compress/pull/278)
+	* huff0: Also populate compression table when reading decoding table. [#275](https://github.com/klauspost/compress/pull/275)
+	
+* June 23, 2020 (v1.10.10) 
+	* zstd: Skip entropy compression in fastest mode when no matches. [#270](https://github.com/klauspost/compress/pull/270)
+	
+* June 16, 2020 (v1.10.9): 
+	* zstd: API change for specifying dictionaries. See [#268](https://github.com/klauspost/compress/pull/268)
+	* zip: update CreateHeaderRaw to handle zip64 fields. [#266](https://github.com/klauspost/compress/pull/266)
+	* Fuzzit tests removed. The service has been purchased and is no longer available.
+	
+* June 5, 2020 (v1.10.8): 
+	* 1.15x faster zstd block decompression. [#265](https://github.com/klauspost/compress/pull/265)
+	
+* June 1, 2020 (v1.10.7): 
+	* Added zstd decompression [dictionary support](https://github.com/klauspost/compress/tree/master/zstd#dictionaries)
+	* Increase zstd decompression speed up to 1.19x.  [#259](https://github.com/klauspost/compress/pull/259)
+	* Remove internal reset call in zstd compression and reduce allocations. [#263](https://github.com/klauspost/compress/pull/263)
+	
+* May 21, 2020: (v1.10.6) 
+	* zstd: Reduce allocations while decoding. [#258](https://github.com/klauspost/compress/pull/258), [#252](https://github.com/klauspost/compress/pull/252)
+	* zstd: Stricter decompression checks.
+	
+* April 12, 2020: (v1.10.5)
+	* s2-commands: Flush output when receiving SIGINT. [#239](https://github.com/klauspost/compress/pull/239)
+	
+* Apr 8, 2020: (v1.10.4) 
+	* zstd: Minor/special case optimizations. [#251](https://github.com/klauspost/compress/pull/251),  [#250](https://github.com/klauspost/compress/pull/250),  [#249](https://github.com/klauspost/compress/pull/249),  [#247](https://github.com/klauspost/compress/pull/247)
+* Mar 11, 2020: (v1.10.3) 
+	* s2: Use S2 encoder in pure Go mode for Snappy output as well. [#245](https://github.com/klauspost/compress/pull/245)
+	* s2: Fix pure Go block encoder. [#244](https://github.com/klauspost/compress/pull/244)
+	* zstd: Added "better compression" mode. [#240](https://github.com/klauspost/compress/pull/240)
+	* zstd: Improve speed of fastest compression mode by 5-10% [#241](https://github.com/klauspost/compress/pull/241)
+	* zstd: Skip creating encoders when not needed. [#238](https://github.com/klauspost/compress/pull/238)
+	
+* Feb 27, 2020: (v1.10.2) 
+	* Close to 50% speedup in inflate (gzip/zip decompression). [#236](https://github.com/klauspost/compress/pull/236) [#234](https://github.com/klauspost/compress/pull/234) [#232](https://github.com/klauspost/compress/pull/232)
+	* Reduce deflate level 1-6 memory usage up to 59%. [#227](https://github.com/klauspost/compress/pull/227)
+	
+* Feb 18, 2020: (v1.10.1)
+	* Fix zstd crash when resetting multiple times without sending data. [#226](https://github.com/klauspost/compress/pull/226)
+	* deflate: Fix dictionary use on level 1-6. [#224](https://github.com/klauspost/compress/pull/224)
+	* Remove deflate writer reference when closing. [#224](https://github.com/klauspost/compress/pull/224)
+	
+* Feb 4, 2020: (v1.10.0) 
+	* Add optional dictionary to [stateless deflate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc#StatelessDeflate). Breaking change, send `nil` for previous behaviour. [#216](https://github.com/klauspost/compress/pull/216)
+	* Fix buffer overflow on repeated small block deflate.  [#218](https://github.com/klauspost/compress/pull/218)
+	* Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214)
+	* Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s.  [#186](https://github.com/klauspost/compress/pull/186)
+
+</details>
+
+<details>
+	<summary>See changes prior to v1.10.0</summary>
+
+* Jan 20,2020 (v1.9.8) Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056),  [#206](https://github.com/klauspost/compress/pull/206).
+* Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204) 
+* Jan 5, 2020: (v1.9.7) Fix another zstd regression in v1.9.5 - v1.9.6 removed.
+* Jan 4, 2020: (v1.9.6) Regression in v1.9.5 fixed causing corrupt zstd encodes in rare cases.
+* Jan 4, 2020: Faster IO in [s2c + s2d commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) compression/decompression. [#192](https://github.com/klauspost/compress/pull/192)
+* Dec 29, 2019: Removed v1.9.5 since fuzz tests showed a compatibility problem with the reference zstandard decoder.
+* Dec 29, 2019: (v1.9.5) zstd: 10-20% faster block compression. [#199](https://github.com/klauspost/compress/pull/199)
+* Dec 29, 2019: [zip](https://godoc.org/github.com/klauspost/compress/zip) package updated with latest Go features
+* Dec 29, 2019: zstd: Single segment flag condintions tweaked. [#197](https://github.com/klauspost/compress/pull/197)
+* Dec 18, 2019: s2: Faster compression when ReadFrom is used. [#198](https://github.com/klauspost/compress/pull/198)
+* Dec 10, 2019: s2: Fix repeat length output when just above at 16MB limit.
+* Dec 10, 2019: zstd: Add function to get decoder as io.ReadCloser. [#191](https://github.com/klauspost/compress/pull/191)
+* Dec 3, 2019: (v1.9.4) S2: limit max repeat length. [#188](https://github.com/klauspost/compress/pull/188)
+* Dec 3, 2019: Add [WithNoEntropyCompression](https://godoc.org/github.com/klauspost/compress/zstd#WithNoEntropyCompression) to zstd [#187](https://github.com/klauspost/compress/pull/187)
+* Dec 3, 2019: Reduce memory use for tests. Check for leaked goroutines.
+* Nov 28, 2019 (v1.9.3) Less allocations in stateless deflate.
+* Nov 28, 2019: 5-20% Faster huff0 decode. Impacts zstd as well. [#184](https://github.com/klauspost/compress/pull/184)
+* Nov 12, 2019 (v1.9.2) Added [Stateless Compression](#stateless-compression) for gzip/deflate.
+* Nov 12, 2019: Fixed zstd decompression of large single blocks. [#180](https://github.com/klauspost/compress/pull/180)
+* Nov 11, 2019: Set default  [s2c](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) block size to 4MB.
+* Nov 11, 2019: Reduce inflate memory use by 1KB.
+* Nov 10, 2019: Less allocations in deflate bit writer.
+* Nov 10, 2019: Fix inconsistent error returned by zstd decoder.
+* Oct 28, 2019 (v1.9.1) ztsd: Fix crash when compressing blocks. [#174](https://github.com/klauspost/compress/pull/174)
+* Oct 24, 2019 (v1.9.0) zstd: Fix rare data corruption [#173](https://github.com/klauspost/compress/pull/173)
+* Oct 24, 2019 zstd: Fix huff0 out of buffer write [#171](https://github.com/klauspost/compress/pull/171) and always return errors [#172](https://github.com/klauspost/compress/pull/172) 
+* Oct 10, 2019: Big deflate rewrite, 30-40% faster with better compression [#105](https://github.com/klauspost/compress/pull/105)
+
+</details>
+
+<details>
+	<summary>See changes prior to v1.9.0</summary>
+
+* Oct 10, 2019: (v1.8.6) zstd: Allow partial reads to get flushed data. [#169](https://github.com/klauspost/compress/pull/169)
+* Oct 3, 2019: Fix inconsistent results on broken zstd streams.
+* Sep 25, 2019: Added `-rm` (remove source files) and `-q` (no output except errors) to `s2c` and `s2d` [commands](https://github.com/klauspost/compress/tree/master/s2#commandline-tools)
+* Sep 16, 2019: (v1.8.4) Add `s2c` and `s2d` [commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools).
+* Sep 10, 2019: (v1.8.3) Fix s2 decoder [Skip](https://godoc.org/github.com/klauspost/compress/s2#Reader.Skip).
+* Sep 7, 2019: zstd: Added [WithWindowSize](https://godoc.org/github.com/klauspost/compress/zstd#WithWindowSize), contributed by [ianwilkes](https://github.com/ianwilkes).
+* Sep 5, 2019: (v1.8.2) Add [WithZeroFrames](https://godoc.org/github.com/klauspost/compress/zstd#WithZeroFrames) which adds full zero payload block encoding option.
+* Sep 5, 2019: Lazy initialization of zstandard predefined en/decoder tables.
+* Aug 26, 2019: (v1.8.1) S2: 1-2% compression increase in "better" compression mode.
+* Aug 26, 2019: zstd: Check maximum size of Huffman 1X compressed literals while decoding.
+* Aug 24, 2019: (v1.8.0) Added [S2 compression](https://github.com/klauspost/compress/tree/master/s2#s2-compression), a high performance replacement for Snappy. 
+* Aug 21, 2019: (v1.7.6) Fixed minor issues found by fuzzer. One could lead to zstd not decompressing.
+* Aug 18, 2019: Add [fuzzit](https://fuzzit.dev/) continuous fuzzing.
+* Aug 14, 2019: zstd: Skip incompressible data 2x faster.  [#147](https://github.com/klauspost/compress/pull/147)
+* Aug 4, 2019 (v1.7.5): Better literal compression. [#146](https://github.com/klauspost/compress/pull/146)
+* Aug 4, 2019: Faster zstd compression. [#143](https://github.com/klauspost/compress/pull/143) [#144](https://github.com/klauspost/compress/pull/144)
+* Aug 4, 2019: Faster zstd decompression. [#145](https://github.com/klauspost/compress/pull/145) [#143](https://github.com/klauspost/compress/pull/143) [#142](https://github.com/klauspost/compress/pull/142)
+* July 15, 2019 (v1.7.4): Fix double EOF block in rare cases on zstd encoder.
+* July 15, 2019 (v1.7.3): Minor speedup/compression increase in default zstd encoder.
+* July 14, 2019: zstd decoder: Fix decompression error on multiple uses with mixed content.
+* July 7, 2019 (v1.7.2): Snappy update, zstd decoder potential race fix.
+* June 17, 2019: zstd decompression bugfix.
+* June 17, 2019: fix 32 bit builds.
+* June 17, 2019: Easier use in modules (less dependencies).
+* June 9, 2019: New stronger "default" [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression mode. Matches zstd default compression ratio.
+* June 5, 2019: 20-40% throughput in [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and better compression.
+* June 5, 2019: deflate/gzip compression: Reduce memory usage of lower compression levels.
+* June 2, 2019: Added [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression!
+* May 25, 2019: deflate/gzip: 10% faster bit writer, mostly visible in lower levels.
+* Apr 22, 2019: [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) decompression added.
+* Aug 1, 2018: Added [huff0 README](https://github.com/klauspost/compress/tree/master/huff0#huff0-entropy-compression).
+* Jul 8, 2018: Added [Performance Update 2018](#performance-update-2018) below.
+* Jun 23, 2018: Merged [Go 1.11 inflate optimizations](https://go-review.googlesource.com/c/go/+/102235). Go 1.9 is now required. Backwards compatible version tagged with [v1.3.0](https://github.com/klauspost/compress/releases/tag/v1.3.0).
+* Apr 2, 2018: Added [huff0](https://godoc.org/github.com/klauspost/compress/huff0) en/decoder. Experimental for now, API may change.
+* Mar 4, 2018: Added [FSE Entropy](https://godoc.org/github.com/klauspost/compress/fse) en/decoder. Experimental for now, API may change.
+* Nov 3, 2017: Add compression [Estimate](https://godoc.org/github.com/klauspost/compress#Estimate) function.
+* May 28, 2017: Reduce allocations when resetting decoder.
+* Apr 02, 2017: Change back to official crc32, since changes were merged in Go 1.7.
+* Jan 14, 2017: Reduce stack pressure due to array copies. See [Issue #18625](https://github.com/golang/go/issues/18625).
+* Oct 25, 2016: Level 2-4 have been rewritten and now offers significantly better performance than before.
+* Oct 20, 2016: Port zlib changes from Go 1.7 to fix zlib writer issue. Please update.
+* Oct 16, 2016: Go 1.7 changes merged. Apples to apples this package is a few percent faster, but has a significantly better balance between speed and compression per level. 
+* Mar 24, 2016: Always attempt Huffman encoding on level 4-7. This improves base 64 encoded data compression.
+* Mar 24, 2016: Small speedup for level 1-3.
+* Feb 19, 2016: Faster bit writer, level -2 is 15% faster, level 1 is 4% faster.
+* Feb 19, 2016: Handle small payloads faster in level 1-3.
+* Feb 19, 2016: Added faster level 2 + 3 compression modes.
+* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progresssion in terms of compression. New default level is 5.
+* Feb 14, 2016: Snappy: Merge upstream changes. 
+* Feb 14, 2016: Snappy: Fix aggressive skipping.
+* Feb 14, 2016: Snappy: Update benchmark.
+* Feb 13, 2016: Deflate: Fixed assembler problem that could lead to sub-optimal compression.
+* Feb 12, 2016: Snappy: Added AMD64 SSE 4.2 optimizations to matching, which makes easy to compress material run faster. Typical speedup is around 25%.
+* Feb 9, 2016: Added Snappy package fork. This version is 5-7% faster, much more on hard to compress content.
+* Jan 30, 2016: Optimize level 1 to 3 by not considering static dictionary or storing uncompressed. ~4-5% speedup.
+* Jan 16, 2016: Optimization on deflate level 1,2,3 compression.
+* Jan 8 2016: Merge [CL 18317](https://go-review.googlesource.com/#/c/18317): fix reading, writing of zip64 archives.
+* Dec 8 2015: Make level 1 and -2 deterministic even if write size differs.
+* Dec 8 2015: Split encoding functions, so hashing and matching can potentially be inlined. 1-3% faster on AMD64. 5% faster on other platforms.
+* Dec 8 2015: Fixed rare [one byte out-of bounds read](https://github.com/klauspost/compress/issues/20). Please update!
+* Nov 23 2015: Optimization on token writer. ~2-4% faster. Contributed by [@dsnet](https://github.com/dsnet).
+* Nov 20 2015: Small optimization to bit writer on 64 bit systems.
+* Nov 17 2015: Fixed out-of-bound errors if the underlying Writer returned an error. See [#15](https://github.com/klauspost/compress/issues/15).
+* Nov 12 2015: Added [io.WriterTo](https://golang.org/pkg/io/#WriterTo) support to gzip/inflate.
+* Nov 11 2015: Merged [CL 16669](https://go-review.googlesource.com/#/c/16669/4): archive/zip: enable overriding (de)compressors per file
+* Oct 15 2015: Added skipping on uncompressible data. Random data speed up >5x.
+
+</details>
+
+# deflate usage
+
+The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:
+
+| old import         | new import                              | Documentation
+|--------------------|-----------------------------------------|--------------------|
+| `compress/gzip`    | `github.com/klauspost/compress/gzip`    | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc)
+| `compress/zlib`    | `github.com/klauspost/compress/zlib`    | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc)
+| `archive/zip`      | `github.com/klauspost/compress/zip`     | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc)
+| `compress/flate`   | `github.com/klauspost/compress/flate`   | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc)
+
+* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
+
+You may also be interested in [pgzip](https://github.com/klauspost/pgzip), which is a drop in replacement for gzip, which support multithreaded compression on big files and the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages.
+
+The packages contains the same as the standard library, so you can use the godoc for that: [gzip](http://golang.org/pkg/compress/gzip/), [zip](http://golang.org/pkg/archive/zip/),  [zlib](http://golang.org/pkg/compress/zlib/), [flate](http://golang.org/pkg/compress/flate/).
+
+Currently there is only minor speedup on decompression (mostly CRC32 calculation).
+
+Memory usage is typically 1MB for a Writer. stdlib is in the same range. 
+If you expect to have a lot of concurrently allocated Writers consider using 
+the stateless compress described below.
+
+For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).
+
+# Stateless compression
+
+This package offers stateless compression as a special option for gzip/deflate. 
+It will do compression but without maintaining any state between Write calls.
+
+This means there will be no memory kept between Write calls, but compression and speed will be suboptimal.
+
+This is only relevant in cases where you expect to run many thousands of compressors concurrently, 
+but with very little activity. This is *not* intended for regular web servers serving individual requests.  
+
+Because of this, the size of actual Write calls will affect output size.
+
+In gzip, specify level `-3` / `gzip.StatelessCompression` to enable.
+
+For direct deflate use, NewStatelessWriter and StatelessDeflate are available. See [documentation](https://godoc.org/github.com/klauspost/compress/flate#NewStatelessWriter)
+
+A `bufio.Writer` can of course be used to control write sizes. For example, to use a 4KB buffer:
+
+```
+	// replace 'ioutil.Discard' with your output.
+	gzw, err := gzip.NewWriterLevel(ioutil.Discard, gzip.StatelessCompression)
+	if err != nil {
+		return err
+	}
+	defer gzw.Close()
+
+	w := bufio.NewWriterSize(gzw, 4096)
+	defer w.Flush()
+	
+	// Write to 'w' 
+```
+
+This will only use up to 4KB in memory when the writer is idle. 
+
+Compression is almost always worse than the fastest compression level 
+and each write will allocate (a little) memory. 
+
+# Performance Update 2018
+
+It has been a while since we have been looking at the speed of this package compared to the standard library, so I thought I would re-do my tests and give some overall recommendations based on the current state. All benchmarks have been performed with Go 1.10 on my Desktop Intel(R) Core(TM) i7-2600 CPU @3.40GHz. Since I last ran the tests, I have gotten more RAM, which means tests with big files are no longer limited by my SSD.
+
+The raw results are in my [updated spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). Due to cgo changes and upstream updates i could not get the cgo version of gzip to compile. Instead I included the [zstd](https://github.com/datadog/zstd) cgo implementation. If I get cgo gzip to work again, I might replace the results in the sheet.
+
+The columns to take note of are: *MB/s* - the throughput. *Reduction* - the data size reduction in percent of the original. *Rel Speed* relative speed compared to the standard library at the same level. *Smaller* - how many percent smaller is the compressed output compared to stdlib. Negative means the output was bigger. *Loss* means the loss (or gain) in compression as a percentage difference of the input.
+
+The `gzstd` (standard library gzip) and `gzkp` (this package gzip) only uses one CPU core. [`pgzip`](https://github.com/klauspost/pgzip), [`bgzf`](https://github.com/biogo/hts/tree/master/bgzf) uses all 4 cores. [`zstd`](https://github.com/DataDog/zstd) uses one core, and is a beast (but not Go, yet).
+
+
+## Overall differences.
+
+There appears to be a roughly 5-10% speed advantage over the standard library when comparing at similar compression levels.
+
+The biggest difference you will see is the result of [re-balancing](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) the compression levels. I wanted by library to give a smoother transition between the compression levels than the standard library.
+
+This package attempts to provide a more smooth transition, where "1" is taking a lot of shortcuts, "5" is the reasonable trade-off and "9" is the "give me the best compression", and the values in between gives something reasonable in between. The standard library has big differences in levels 1-4, but levels 5-9 having no significant gains - often spending a lot more time than can be justified by the achieved compression.
+
+There are links to all the test data in the [spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) in the top left field on each tab.
+
+## Web Content
+
+This test set aims to emulate typical use in a web server. The test-set is 4GB data in 53k files, and is a mixture of (mostly) HTML, JS, CSS.
+
+Since level 1 and 9 are close to being the same code, they are quite close. But looking at the levels in-between the differences are quite big.
+
+Looking at level 6, this package is 88% faster, but will output about 6% more data. For a web server, this means you can serve 88% more data, but have to pay for 6% more bandwidth. You can draw your own conclusions on what would be the most expensive for your case.
+
+## Object files
+
+This test is for typical data files stored on a server. In this case it is a collection of Go precompiled objects. They are very compressible.
+
+The picture is similar to the web content, but with small differences since this is very compressible. Levels 2-3 offer good speed, but is sacrificing quite a bit of compression. 
+
+The standard library seems suboptimal on level 3 and 4 - offering both worse compression and speed than level 6 & 7 of this package respectively.
+
+## Highly Compressible File
+
+This is a JSON file with very high redundancy. The reduction starts at 95% on level 1, so in real life terms we are dealing with something like a highly redundant stream of data, etc.
+
+It is definitely visible that we are dealing with specialized content here, so the results are very scattered. This package does not do very well at levels 1-4, but picks up significantly at level 5 and levels 7 and 8 offering great speed for the achieved compression.
+
+So if you know you content is extremely compressible you might want to go slightly higher than the defaults. The standard library has a huge gap between levels 3 and 4 in terms of speed (2.75x slowdown), so it offers little "middle ground".
+
+## Medium-High Compressible
+
+This is a pretty common test corpus: [enwik9](http://mattmahoney.net/dc/textdata.html). It contains the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. This is a very good test of typical text based compression and more data heavy streams.
+
+We see a similar picture here as in "Web Content". On equal levels some compression is sacrificed for more speed. Level 5 seems to be the best trade-off between speed and size, beating stdlib level 3 in both.
+
+## Medium Compressible
+
+I will combine two test sets, one [10GB file set](http://mattmahoney.net/dc/10gb.html) and a VM disk image (~8GB). Both contain different data types and represent a typical backup scenario.
+
+The most notable thing is how quickly the standard library drops to very low compression speeds around level 5-6 without any big gains in compression. Since this type of data is fairly common, this does not seem like good behavior.
+
+
+## Un-compressible Content
+
+This is mainly a test of how good the algorithms are at detecting un-compressible input. The standard library only offers this feature with very conservative settings at level 1. Obviously there is no reason for the algorithms to try to compress input that cannot be compressed.  The only downside is that it might skip some compressible data on false detections.
+
+
+## Huffman only compression
+
+This compression library adds a special compression level, named `HuffmanOnly`, which allows near linear time compression. This is done by completely disabling matching of previous data, and only reduce the number of bits to represent each character. 
+
+This means that often used characters, like 'e' and ' ' (space) in text use the fewest bits to represent, and rare characters like '¤' takes more bits to represent. For more information see [wikipedia](https://en.wikipedia.org/wiki/Huffman_coding) or this nice [video](https://youtu.be/ZdooBTdW5bM).
+
+Since this type of compression has much less variance, the compression speed is mostly unaffected by the input data, and is usually more than *180MB/s* for a single core.
+
+The downside is that the compression ratio is usually considerably worse than even the fastest conventional compression. The compression ratio can never be better than 8:1 (12.5%). 
+
+The linear time compression can be used as a "better than nothing" mode, where you cannot risk the encoder to slow down on some content. For comparison, the size of the "Twain" text is *233460 bytes* (+29% vs. level 1) and encode speed is 144MB/s (4.5x level 1). So in this case you trade a 30% size increase for a 4 times speedup.
+
+For more information see my blog post on [Fast Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
+
+This is implemented on Go 1.7 as "Huffman Only" mode, though not exposed for gzip.
+
+# Other packages
+
+Here are other packages of good quality and pure Go (no cgo wrappers or autoconverted code):
+
+* [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.
+* [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.
+* [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.
+
+# license
+
+This code is licensed under the same conditions as the original Go code. See LICENSE file.
diff --git a/vendor/github.com/klauspost/compress/compressible.go b/vendor/github.com/klauspost/compress/compressible.go
new file mode 100644
index 0000000..ea5a692
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/compressible.go
@@ -0,0 +1,85 @@
+package compress
+
+import "math"
+
+// Estimate returns a normalized compressibility estimate of block b.
+// Values close to zero are likely uncompressible.
+// Values above 0.1 are likely to be compressible.
+// Values above 0.5 are very compressible.
+// Very small lengths will return 0.
+func Estimate(b []byte) float64 {
+	if len(b) < 16 {
+		return 0
+	}
+
+	// Correctly predicted order 1
+	hits := 0
+	lastMatch := false
+	var o1 [256]byte
+	var hist [256]int
+	c1 := byte(0)
+	for _, c := range b {
+		if c == o1[c1] {
+			// We only count a hit if there was two correct predictions in a row.
+			if lastMatch {
+				hits++
+			}
+			lastMatch = true
+		} else {
+			lastMatch = false
+		}
+		o1[c1] = c
+		c1 = c
+		hist[c]++
+	}
+
+	// Use x^0.6 to give better spread
+	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
+
+	// Calculate histogram distribution
+	variance := float64(0)
+	avg := float64(len(b)) / 256
+
+	for _, v := range hist {
+		Δ := float64(v) - avg
+		variance += Δ * Δ
+	}
+
+	stddev := math.Sqrt(float64(variance)) / float64(len(b))
+	exp := math.Sqrt(1 / float64(len(b)))
+
+	// Subtract expected stddev
+	stddev -= exp
+	if stddev < 0 {
+		stddev = 0
+	}
+	stddev *= 1 + exp
+
+	// Use x^0.4 to give better spread
+	entropy := math.Pow(stddev, 0.4)
+
+	// 50/50 weight between prediction and histogram distribution
+	return math.Pow((prediction+entropy)/2, 0.9)
+}
+
+// ShannonEntropyBits returns the number of bits minimum required to represent
+// an entropy encoding of the input bytes.
+// https://en.wiktionary.org/wiki/Shannon_entropy
+func ShannonEntropyBits(b []byte) int {
+	if len(b) == 0 {
+		return 0
+	}
+	var hist [256]int
+	for _, c := range b {
+		hist[c]++
+	}
+	shannon := float64(0)
+	invTotal := 1.0 / float64(len(b))
+	for _, v := range hist[:] {
+		if v > 0 {
+			n := float64(v)
+			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+		}
+	}
+	return int(math.Ceil(shannon))
+}
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
index 25dbe3e..bffa2f3 100644
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -6,6 +6,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
@@ -37,15 +38,17 @@ const (
 	maxMatchLength   = 258 // The longest match for the compressor
 	minOffsetSize    = 1   // The shortest offset that makes any sense
 
-	// The maximum number of tokens we put into a single flat block, just too
-	// stop things from getting too large.
-	maxFlateBlockTokens = 1 << 14
+	// The maximum number of tokens we will encode at the time.
+	// Smaller sizes usually creates less optimal blocks.
+	// Bigger can make context switching slow.
+	// We use this for levels 7-9, so we make it big.
+	maxFlateBlockTokens = 1 << 15
 	maxStoreBlockSize   = 65535
 	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
 	hashMask            = (1 << hashBits) - 1
 	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
-	maxHashOffset       = 1 << 24
+	maxHashOffset       = 1 << 28
 
 	skipNever = math.MaxInt32
 
@@ -70,9 +73,9 @@ var levels = []compressionLevel{
 	{0, 0, 0, 0, 0, 6},
 	// Levels 7-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{8, 8, 24, 16, skipNever, 7},
-	{10, 16, 24, 64, skipNever, 8},
-	{32, 258, 258, 4096, skipNever, 9},
+	{8, 12, 16, 24, skipNever, 7},
+	{16, 30, 40, 64, skipNever, 8},
+	{32, 258, 258, 1024, skipNever, 9},
 }
 
 // advancedState contains state for the advanced levels, with bigger hash tables, etc.
@@ -93,8 +96,9 @@ type advancedState struct {
 	hashOffset int
 
 	// input window: unprocessed data is window[index:windowEnd]
-	index     int
-	hashMatch [maxMatchLength + minMatchLength]uint32
+	index          int
+	estBitsPerByte int
+	hashMatch      [maxMatchLength + minMatchLength]uint32
 
 	hash uint32
 	ii   uint16 // position of last match, intended to overflow to reset.
@@ -103,6 +107,7 @@ type advancedState struct {
 type compressor struct {
 	compressionLevel
 
+	h *huffmanEncoder
 	w *huffmanBitWriter
 
 	// compression algorithm
@@ -170,7 +175,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tok, eof, window)
+		//d.w.writeBlock(tok, eof, window)
+		d.w.writeBlockDynamic(tok, eof, window, d.sync)
 		return d.w.err
 	}
 	return nil
@@ -263,7 +269,7 @@ func (d *compressor) fillWindow(b []byte) {
 // Try to find a match starting at index whose length is greater than prevSize.
 // We only look at chainCount possibilities before giving up.
 // pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
-func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
 	minMatchLook := maxMatchLength
 	if lookahead < minMatchLook {
 		minMatchLook = lookahead
@@ -279,36 +285,75 @@ func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead
 
 	// If we've got a match that's good enough, only look in 1/4 the chain.
 	tries := d.chain
-	length = prevLength
-	if length >= d.good {
-		tries >>= 2
-	}
+	length = minMatchLength - 1
 
 	wEnd := win[pos+length]
 	wPos := win[pos:]
 	minIndex := pos - windowSize
+	if minIndex < 0 {
+		minIndex = 0
+	}
+	offset = 0
+
+	cGain := 0
+	if d.chain < 100 {
+		for i := prevHead; tries > 0; tries-- {
+			if wEnd == win[i+length] {
+				n := matchLen(win[i:i+minMatchLook], wPos)
+				if n > length {
+					length = n
+					offset = pos - i
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
+				}
+			}
+			if i <= minIndex {
+				// hashPrev[i & windowMask] has already been overwritten, so stop now.
+				break
+			}
+			i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+			if i < minIndex {
+				break
+			}
+		}
+		return
+	}
 
+	// Some like it higher (CSV), some like it lower (JSON)
+	const baseCost = 6
+	// Base is 4 bytes at with an additional cost.
+	// Matches must be better than this.
 	for i := prevHead; tries > 0; tries-- {
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
-
-			if n > length && (n > minMatchLength || pos-i <= 4096) {
-				length = n
-				offset = pos - i
-				ok = true
-				if n >= nice {
-					// The match is good enough that we don't try to find a better one.
-					break
+			if n > length {
+				// Calculate gain. Estimate
+				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
+
+				//fmt.Println(n, "gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]))
+				if newGain > cGain {
+					length = n
+					offset = pos - i
+					cGain = newGain
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
 				}
-				wEnd = win[pos+n]
 			}
 		}
-		if i == minIndex {
+		if i <= minIndex {
 			// hashPrev[i & windowMask] has already been overwritten, so stop now.
 			break
 		}
 		i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
-		if i < minIndex || i < 0 {
+		if i < minIndex {
 			break
 		}
 	}
@@ -327,8 +372,7 @@ func (d *compressor) writeStoredBlock(buf []byte) error {
 // of the supplied slice.
 // The caller must ensure that len(b) >= 4.
 func hash4(b []byte) uint32 {
-	b = b[:4]
-	return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
+	return hash4u(binary.LittleEndian.Uint32(b), hashBits)
 }
 
 // bulkHash4 will compute hashes using the same
@@ -337,11 +381,12 @@ func bulkHash4(b []byte, dst []uint32) {
 	if len(b) < 4 {
 		return
 	}
-	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	hb := binary.LittleEndian.Uint32(b)
+
 	dst[0] = hash4u(hb, hashBits)
 	end := len(b) - 4 + 1
 	for i := 1; i < end; i++ {
-		hb = (hb << 8) | uint32(b[i+3])
+		hb = (hb >> 8) | uint32(b[i+3])<<24
 		dst[i] = hash4u(hb, hashBits)
 	}
 }
@@ -374,10 +419,21 @@ func (d *compressor) deflateLazy() {
 	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
 		return
 	}
+	if d.windowEnd != s.index && d.chain > 100 {
+		// Get literal huffman coder.
+		if d.h == nil {
+			d.h = newHuffmanEncoder(maxFlateBlockTokens)
+		}
+		var tmp [256]uint16
+		for _, v := range d.window[s.index:d.windowEnd] {
+			tmp[v]++
+		}
+		d.h.generate(tmp[:], 15)
+	}
 
 	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
 	if s.index < s.maxInsertIndex {
-		s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+		s.hash = hash4(d.window[s.index:])
 	}
 
 	for {
@@ -410,7 +466,7 @@ func (d *compressor) deflateLazy() {
 		}
 		if s.index < s.maxInsertIndex {
 			// Update the hash
-			s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+			s.hash = hash4(d.window[s.index:])
 			ch := s.hashHead[s.hash&hashMask]
 			s.chainHead = int(ch)
 			s.hashPrev[s.index&windowMask] = ch
@@ -426,12 +482,37 @@ func (d *compressor) deflateLazy() {
 		}
 
 		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
-			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
+			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok {
 				s.length = newLength
 				s.offset = newOffset
 			}
 		}
+
 		if prevLength >= minMatchLength && s.length <= prevLength {
+			// Check for better match at end...
+			//
+			// checkOff must be >=2 since we otherwise risk checking s.index
+			// Offset of 2 seems to yield best results.
+			const checkOff = 2
+			prevIndex := s.index - 1
+			if prevIndex+prevLength+checkOff < s.maxInsertIndex {
+				end := lookahead
+				if lookahead > maxMatchLength {
+					end = maxMatchLength
+				}
+				end += prevIndex
+				idx := prevIndex + prevLength - (4 - checkOff)
+				h := hash4(d.window[idx:])
+				ch2 := int(s.hashHead[h&hashMask]) - s.hashOffset - prevLength + (4 - checkOff)
+				if ch2 > minIndex {
+					length := matchLen(d.window[prevIndex:end], d.window[ch2:])
+					// It seems like a pure length metric is best.
+					if length > prevLength {
+						prevLength = length
+						prevOffset = prevIndex - ch2
+					}
+				}
+			}
 			// There was a match at the previous step, and the current match is
 			// not better. Output the previous match.
 			d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
@@ -440,8 +521,7 @@ func (d *compressor) deflateLazy() {
 			// index and index-1 are already inserted. If there is not enough
 			// lookahead, the last two strings are not inserted into the hash
 			// table.
-			var newIndex int
-			newIndex = s.index + prevLength - 1
+			newIndex := s.index + prevLength - 1
 			// Calculate missing hashes
 			end := newIndex
 			if end > s.maxInsertIndex {
@@ -480,6 +560,7 @@ func (d *compressor) deflateLazy() {
 				}
 				d.tokens.Reset()
 			}
+			s.ii = 0
 		} else {
 			// Reset, if we got a match this run.
 			if s.length >= minMatchLength {
@@ -499,13 +580,12 @@ func (d *compressor) deflateLazy() {
 
 				// If we have a long run of no matches, skip additional bytes
 				// Resets when s.ii overflows after 64KB.
-				if s.ii > 31 {
-					n := int(s.ii >> 5)
+				if n := int(s.ii) - d.chain; n > 0 {
+					n = 1 + int(n>>6)
 					for j := 0; j < n; j++ {
 						if s.index >= d.windowEnd-1 {
 							break
 						}
-
 						d.tokens.AddLiteral(d.window[s.index-1])
 						if d.tokens.n == maxFlateBlockTokens {
 							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
@@ -513,6 +593,14 @@ func (d *compressor) deflateLazy() {
 							}
 							d.tokens.Reset()
 						}
+						// Index...
+						if s.index < s.maxInsertIndex {
+							h := hash4(d.window[s.index:])
+							ch := s.hashHead[h]
+							s.chainHead = int(ch)
+							s.hashPrev[s.index&windowMask] = ch
+							s.hashHead[h] = uint32(s.index + s.hashOffset)
+						}
 						s.index++
 					}
 					// Flush last byte
@@ -612,7 +700,9 @@ func (d *compressor) write(b []byte) (n int, err error) {
 	}
 	n = len(b)
 	for len(b) > 0 {
-		d.step(d)
+		if d.windowEnd == len(d.window) || d.sync {
+			d.step(d)
+		}
 		b = b[d.fill(d, b):]
 		if d.err != nil {
 			return 0, d.err
@@ -645,21 +735,21 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).store
 	case level == ConstantCompression:
-		d.w.logNewTablePenalty = 4
-		d.window = make([]byte, maxStoreBlockSize)
+		d.w.logNewTablePenalty = 10
+		d.window = make([]byte, 32<<10)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeHuff
 	case level == DefaultCompression:
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 6
+		d.w.logNewTablePenalty = 7
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeFast
 	case 7 <= level && level <= 9:
-		d.w.logNewTablePenalty = 10
+		d.w.logNewTablePenalty = 8
 		d.state = &advancedState{}
 		d.compressionLevel = levels[level]
 		d.initDeflate()
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
index 6d4c1e9..d55ea2a 100644
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -6,6 +6,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"fmt"
 	"math/bits"
 )
@@ -44,7 +45,7 @@ const (
 
 	bTableBits   = 17                                               // Bits used in the big tables
 	bTableSize   = 1 << bTableBits                                  // Size of the table
-	allocHistory = maxStoreBlockSize * 10                           // Size to preallocate for history.
+	allocHistory = maxStoreBlockSize * 5                            // Size to preallocate for history.
 	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this.
 )
 
@@ -65,26 +66,15 @@ func load32(b []byte, i int) uint32 {
 }
 
 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }
 
 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func hash(u uint32) uint32 {
@@ -127,7 +117,7 @@ func (e *fastGen) addBlock(src []byte) int32 {
 // hash4 returns the hash of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <32.
 func hash4u(u uint32, h uint8) uint32 {
-	return (u * prime4bytes) >> ((32 - h) & 31)
+	return (u * prime4bytes) >> ((32 - h) & reg8SizeMask32)
 }
 
 type tableEntryPrev struct {
@@ -138,25 +128,25 @@ type tableEntryPrev struct {
 // hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <32.
 func hash4x64(u uint64, h uint8) uint32 {
-	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
+	return (uint32(u) * prime4bytes) >> ((32 - h) & reg8SizeMask32)
 }
 
 // hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <64.
 func hash7(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64))
 }
 
 // hash8 returns the hash of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <64.
 func hash8(u uint64, h uint8) uint32 {
-	return uint32((u * prime8bytes) >> ((64 - h) & 63))
+	return uint32((u * prime8bytes) >> ((64 - h) & reg8SizeMask64))
 }
 
 // hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
 // Preferably h should be a constant and should always be <64.
 func hash6(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
+	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & reg8SizeMask64))
 }
 
 // matchlen will return the match length between offsets and t in src.
@@ -189,7 +179,7 @@ func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
 // matchlenLong will return the match length between offsets and t in src.
 // It is assumed that s > t, that t >=0 and s < len(src).
 func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
-	if debugDecode {
+	if debugDeflate {
 		if t >= s {
 			panic(fmt.Sprint("t >=s:", t, s))
 		}
@@ -223,31 +213,20 @@ func (e *fastGen) Reset() {
 // matchLen returns the maximum length.
 // 'a' must be the shortest of the two.
 func matchLen(a, b []byte) int {
-	b = b[:len(a)]
 	var checked int
-	if len(a) > 4 {
-		// Try 4 bytes first
-		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
-			return bits.TrailingZeros32(diff) >> 3
-		}
-		// Switch to 8 byte matching.
-		checked = 4
-		a = a[4:]
-		b = b[4:]
-		for len(a) >= 8 {
-			b = b[:len(a)]
-			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
-				return checked + (bits.TrailingZeros64(diff) >> 3)
-			}
-			checked += 8
-			a = a[8:]
-			b = b[8:]
+
+	for len(a) >= 8 {
+		if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+			return checked + (bits.TrailingZeros64(diff) >> 3)
 		}
+		checked += 8
+		a = a[8:]
+		b = b[8:]
 	}
 	b = b[:len(a)]
 	for i := range a {
 		if a[i] != b[i] {
-			return int(i) + checked
+			return i + checked
 		}
 	}
 	return len(a) + checked
diff --git a/vendor/github.com/klauspost/compress/flate/gen_inflate.go b/vendor/github.com/klauspost/compress/flate/gen_inflate.go
deleted file mode 100644
index c74a95f..0000000
--- a/vendor/github.com/klauspost/compress/flate/gen_inflate.go
+++ /dev/null
@@ -1,274 +0,0 @@
-// +build generate
-
-//go:generate go run $GOFILE && gofmt -w inflate_gen.go
-
-package main
-
-import (
-	"os"
-	"strings"
-)
-
-func main() {
-	f, err := os.Create("inflate_gen.go")
-	if err != nil {
-		panic(err)
-	}
-	defer f.Close()
-	types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
-	names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
-	imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
-	f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
-
-package flate
-
-import (
-`)
-
-	for _, imp := range imports {
-		f.WriteString("\t\"" + imp + "\"\n")
-	}
-	f.WriteString(")\n\n")
-
-	template := `
-
-// Decode a single Huffman block from f.
-// hl and hd are the Huffman states for the lit/length values
-// and the distance values, respectively. If hd == nil, using the
-// fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) $FUNCNAME$() {
-	const (
-		stateInit = iota // Zero value must be stateInit
-		stateDict
-	)
-	fr := f.r.($TYPE$)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
-
-	switch f.stepState {
-	case stateInit:
-		goto readLiteral
-	case stateDict:
-		goto copyHistory
-	}
-
-readLiteral:
-	// Read literal and/or (length, distance) according to RFC section 3.2.3.
-	{
-		var v int
-		{
-			// Inlined v, err := f.huffSym(f.hl)
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := fr.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var n uint // number of bits extra
-		var length int
-		var err error
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).$FUNCNAME$
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
-		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits n>0:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
-		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<5:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
-				}
-				f.err = err
-				return
-			}
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<nb:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
-		default:
-			if debugDecode {
-				fmt.Println("dist too big:", dist, maxNumDist)
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
-			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, dist
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
-`
-	for i, t := range types {
-		s := strings.Replace(template, "$FUNCNAME$", "huffman"+names[i], -1)
-		s = strings.Replace(s, "$TYPE$", t, -1)
-		f.WriteString(s)
-	}
-	f.WriteString("func (f *decompressor) huffmanBlockDecoder() func() {\n")
-	f.WriteString("\tswitch f.r.(type) {\n")
-	for i, t := range types {
-		f.WriteString("\t\tcase " + t + ":\n")
-		f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
-	}
-	f.WriteString("\t\tdefault:\n")
-	f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
-	f.WriteString("\t}\n}\n")
-}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
index 53fe1d0..25f6d11 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -5,7 +5,10 @@
 package flate
 
 import (
+	"encoding/binary"
+	"fmt"
 	"io"
+	"math"
 )
 
 const (
@@ -22,11 +25,15 @@ const (
 	codegenCodeCount = 19
 	badCode          = 255
 
+	// maxPredefinedTokens is the maximum number of tokens
+	// where we check if fixed size is smaller.
+	maxPredefinedTokens = 250
+
 	// bufferFlushSize indicates the buffer size
 	// after which bytes are flushed to the writer.
 	// Should preferably be a multiple of 6, since
 	// we accumulate 6 bytes between writes to the buffer.
-	bufferFlushSize = 240
+	bufferFlushSize = 246
 
 	// bufferSize is the actual output byte buffer size.
 	// It must have additional headroom for a flush
@@ -34,8 +41,11 @@ const (
 	bufferSize = bufferFlushSize + 8
 )
 
+// Minimum length code that emits bits.
+const lengthExtraBitsMinCode = 8
+
 // The number of extra bits needed by length code X - LENGTH_CODES_START.
-var lengthExtraBits = [32]int8{
+var lengthExtraBits = [32]uint8{
 	/* 257 */ 0, 0, 0,
 	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
 	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
@@ -49,28 +59,41 @@ var lengthBase = [32]uint8{
 	64, 80, 96, 112, 128, 160, 192, 224, 255,
 }
 
+// Minimum offset code that emits bits.
+const offsetExtraBitsMinCode = 4
+
 // offset code word extra bits.
-var offsetExtraBits = [64]int8{
+var offsetExtraBits = [32]int8{
 	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
 	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
 	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
 	/* extended window */
-	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+	14, 14,
 }
 
-var offsetBase = [64]uint32{
-	/* normal deflate */
-	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
-	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
-	0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
-	0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
-	0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
-	0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+var offsetCombined = [32]uint32{}
 
-	/* extended window */
-	0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
-	0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
-	0x100000, 0x180000, 0x200000, 0x300000,
+func init() {
+	var offsetBase = [32]uint32{
+		/* normal deflate */
+		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
+		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
+		0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
+		0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
+		0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
+		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+
+		/* extended window */
+		0x008000, 0x00c000,
+	}
+
+	for i := range offsetCombined[:] {
+		// Don't use extended window values...
+		if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 {
+			continue
+		}
+		offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8)
+	}
 }
 
 // The odd order in which the codegen code sizes are written.
@@ -85,17 +108,18 @@ type huffmanBitWriter struct {
 	// Data waiting to be written is bytes[0:nbytes]
 	// and then the low nbits of bits.
 	bits            uint64
-	nbits           uint16
+	nbits           uint8
 	nbytes          uint8
+	lastHuffMan     bool
 	literalEncoding *huffmanEncoder
+	tmpLitEncoding  *huffmanEncoder
 	offsetEncoding  *huffmanEncoder
 	codegenEncoding *huffmanEncoder
 	err             error
 	lastHeader      int
 	// Set between 0 (reused block can be up to 2x the size)
 	logNewTablePenalty uint
-	lastHuffMan        bool
-	bytes              [256]byte
+	bytes              [256 + 8]byte
 	literalFreq        [lengthCodesStart + 32]uint16
 	offsetFreq         [32]uint16
 	codegenFreq        [codegenCodeCount]uint16
@@ -127,6 +151,7 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 	return &huffmanBitWriter{
 		writer:          w,
 		literalEncoding: newHuffmanEncoder(literalCount),
+		tmpLitEncoding:  newHuffmanEncoder(literalCount),
 		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
 		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
 	}
@@ -139,37 +164,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.lastHuffMan = false
 }
 
-func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
-	offsets, lits = true, true
+func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	a := t.offHist[:offsetCodeCount]
-	b := w.offsetFreq[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			offsets = false
-			break
+	b := w.offsetEncoding.codes
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
 
 	a = t.extraHist[:literalCount-256]
-	b = w.literalFreq[256:literalCount]
+	b = w.literalEncoding.codes[256:literalCount]
 	b = b[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			lits = false
-			break
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	if lits {
-		a = t.litHist[:]
-		b = w.literalFreq[:len(a)]
-		for i := range a {
-			if b[i] == 0 && a[i] != 0 {
-				lits = false
-				break
-			}
+
+	a = t.litHist[:256]
+	b = w.literalEncoding.codes[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	return
+	return true
 }
 
 func (w *huffmanBitWriter) flush() {
@@ -205,7 +226,7 @@ func (w *huffmanBitWriter) write(b []byte) {
 	_, w.err = w.writer.Write(b)
 }
 
-func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
+func (w *huffmanBitWriter) writeBits(b int32, nb uint8) {
 	w.bits |= uint64(b) << (w.nbits & 63)
 	w.nbits += nb
 	if w.nbits >= 48 {
@@ -407,7 +428,7 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
 
 func (w *huffmanBitWriter) writeCode(c hcode) {
 	// The function does not get inlined if we "& 63" the shift.
-	w.bits |= uint64(c.code) << w.nbits
+	w.bits |= uint64(c.code) << (w.nbits & 63)
 	w.nbits += c.len
 	if w.nbits >= 48 {
 		w.writeOutBits()
@@ -420,13 +441,11 @@ func (w *huffmanBitWriter) writeOutBits() {
 	w.bits >>= 48
 	w.nbits -= 48
 	n := w.nbytes
-	w.bytes[n] = byte(bits)
-	w.bytes[n+1] = byte(bits >> 8)
-	w.bytes[n+2] = byte(bits >> 16)
-	w.bytes[n+3] = byte(bits >> 24)
-	w.bytes[n+4] = byte(bits >> 32)
-	w.bytes[n+5] = byte(bits >> 40)
+
+	// We over-write, but faster...
+	binary.LittleEndian.PutUint64(w.bytes[n:], bits)
 	n += 6
+
 	if n >= bufferFlushSize {
 		if w.err != nil {
 			n = 0
@@ -435,6 +454,7 @@ func (w *huffmanBitWriter) writeOutBits() {
 		w.write(w.bytes[:n])
 		n = 0
 	}
+
 	w.nbytes = n
 }
 
@@ -551,7 +571,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 		w.lastHeader = 0
 	}
 	numLiterals, numOffsets := w.indexTokens(tokens, false)
-	w.generate(tokens)
+	w.generate()
 	var extraBits int
 	storedSize, storable := w.storedSize(input)
 	if storable {
@@ -562,7 +582,10 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	// Fixed Huffman baseline.
 	var literalEncoding = fixedLiteralEncoding
 	var offsetEncoding = fixedOffsetEncoding
-	var size = w.fixedSize(extraBits)
+	var size = math.MaxInt32
+	if tokens.n < maxPredefinedTokens {
+		size = w.fixedSize(extraBits)
+	}
 
 	// Dynamic Huffman?
 	var numCodegens int
@@ -580,7 +603,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	}
 
 	// Stored bytes?
-	if storable && storedSize < size {
+	if storable && storedSize <= size {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -619,22 +642,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
-	if !sync {
-		tokens.Fill()
+
+	// fillReuse enables filling of empty values.
+	// This will make encodings always reusable without testing.
+	// However, this does not appear to benefit on most cases.
+	const fillReuse = false
+
+	// Check if we can reuse...
+	if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
 	}
+
 	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+	extraBits := 0
+	ssize, storable := w.storedSize(input)
+
+	const usePrefs = true
+	if storable || w.lastHeader > 0 {
+		extraBits = w.extraBitSize()
+	}
 
 	var size int
+
 	// Check if we should reuse.
 	if w.lastHeader > 0 {
 		// Estimate size for using a new table.
 		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
-		newSize += newSize >> w.logNewTablePenalty
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
 
 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
-		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits
 
 		// Check if a new table is better.
 		if newSize < reuseSize {
@@ -645,35 +685,83 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		} else {
 			size = reuseSize
 		}
+
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+				// Check if we get a reasonable size decrease.
+				if storable && ssize <= size {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
 		// Check if we get a reasonable size decrease.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if storable && ssize <= size {
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 	}
 
 	// We want a new block/table
 	if w.lastHeader == 0 {
-		w.generate(tokens)
+		if fillReuse && !sync {
+			w.fillTokens()
+			numLiterals, numOffsets = maxNumLit, maxNumDist
+		} else {
+			w.literalFreq[endBlockMarker] = 1
+		}
+
+		w.generate()
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
 		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+
 		var numCodegens int
-		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
-		// Store bytes, if we don't get a reasonable improvement.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if fillReuse && !sync {
+			// Reindex for accurate size...
+			w.indexTokens(tokens, true)
+		}
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+		// Store predefined, if we don't get a reasonable improvement.
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+				// Store bytes, if we don't get an improvement.
+				if storable && ssize <= preSize {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
+
+		if storable && ssize <= size {
+			// Store bytes, if we don't get an improvement.
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 
 		// Write Huffman table.
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
-		w.lastHeader, _ = w.headerSize()
+		if !sync {
+			w.lastHeader, _ = w.headerSize()
+		}
 		w.lastHuffMan = false
 	}
 
@@ -684,6 +772,19 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }
 
+func (w *huffmanBitWriter) fillTokens() {
+	for i, v := range w.literalFreq[:literalCount] {
+		if v == 0 {
+			w.literalFreq[i] = 1
+		}
+	}
+	for i, v := range w.offsetFreq[:offsetCodeCount] {
+		if v == 0 {
+			w.offsetFreq[i] = 1
+		}
+	}
+}
+
 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
@@ -718,7 +819,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num
 	return
 }
 
-func (w *huffmanBitWriter) generate(t *tokens) {
+func (w *huffmanBitWriter) generate() {
 	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
 	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
 }
@@ -745,52 +846,135 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 	offs := oeCodes[:32]
 	lengths := leCodes[lengthCodesStart:]
 	lengths = lengths[:32]
+
+	// Go 1.16 LOVES having these on stack.
+	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
+
 	for _, t := range tokens {
-		if t < matchType {
-			w.writeCode(lits[t.literal()])
+		if t < 256 {
+			//w.writeCode(lits[t.literal()])
+			c := lits[t]
+			bits |= uint64(c.code) << (nbits & 63)
+			nbits += c.len
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
 			continue
 		}
 
 		// Write the length
 		length := t.length()
-		lengthCode := lengthCode(length)
+		lengthCode := lengthCode(length) & 31
 		if false {
-			w.writeCode(lengths[lengthCode&31])
+			w.writeCode(lengths[lengthCode])
 		} else {
 			// inlined
-			c := lengths[lengthCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & 63)
-			w.nbits += c.len
-			if w.nbits >= 48 {
-				w.writeOutBits()
+			c := lengths[lengthCode]
+			bits |= uint64(c.code) << (nbits & 63)
+			nbits += c.len
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
 			}
 		}
 
-		extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
-		if extraLengthBits > 0 {
-			extraLength := int32(length - lengthBase[lengthCode&31])
-			w.writeBits(extraLength, extraLengthBits)
+		if lengthCode >= lengthExtraBitsMinCode {
+			extraLengthBits := lengthExtraBits[lengthCode]
+			//w.writeBits(extraLength, extraLengthBits)
+			extraLength := int32(length - lengthBase[lengthCode])
+			bits |= uint64(extraLength) << (nbits & 63)
+			nbits += extraLengthBits
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
 		}
 		// Write the offset
 		offset := t.offset()
-		offsetCode := offsetCode(offset)
+		offsetCode := (offset >> 16) & 31
 		if false {
-			w.writeCode(offs[offsetCode&31])
+			w.writeCode(offs[offsetCode])
 		} else {
 			// inlined
-			c := offs[offsetCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & 63)
-			w.nbits += c.len
-			if w.nbits >= 48 {
-				w.writeOutBits()
+			c := offs[offsetCode]
+			bits |= uint64(c.code) << (nbits & 63)
+			nbits += c.len
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
 			}
 		}
-		extraOffsetBits := uint16(offsetExtraBits[offsetCode&63])
-		if extraOffsetBits > 0 {
-			extraOffset := int32(offset - offsetBase[offsetCode&63])
-			w.writeBits(extraOffset, extraOffsetBits)
+
+		if offsetCode >= offsetExtraBitsMinCode {
+			offsetComb := offsetCombined[offsetCode]
+			//w.writeBits(extraOffset, extraOffsetBits)
+			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
+			nbits += uint8(offsetComb)
+			if nbits >= 48 {
+				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
 		}
 	}
+	// Restore...
+	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
+
 	if deferEOB {
 		w.writeCode(leCodes[endBlockMarker])
 	}
@@ -825,43 +1009,85 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		}
 	}
 
+	// Fill is rarely better...
+	const fill = false
+	const numLiterals = endBlockMarker + 1
+	const numOffsets = 1
+
 	// Add everything as literals
 	// We have to estimate the header size.
 	// Assume header is around 70 bytes:
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
-	estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync)
-	estBits += w.lastHeader + 15
+	histogram(input, w.literalFreq[:numLiterals], fill)
+	ssize, storable := w.storedSize(input)
+	if storable && len(input) > 1024 {
+		// Quick check for incompressible content.
+		abs := float64(0)
+		avg := float64(len(input)) / 256
+		max := float64(len(input) * 2)
+		for _, v := range w.literalFreq[:256] {
+			diff := float64(v) - avg
+			abs += diff * diff
+			if abs > max {
+				break
+			}
+		}
+		if abs < max {
+			if debugDeflate {
+				fmt.Println("stored", abs, "<", max)
+			}
+			// No chance we can compress this...
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+	}
+	w.literalFreq[endBlockMarker] = 1
+	w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
+	if fill {
+		// Clear fill...
+		for i := range w.literalFreq[:numLiterals] {
+			w.literalFreq[i] = 0
+		}
+		histogram(input, w.literalFreq[:numLiterals], false)
+	}
+	estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals])
+	estBits += w.lastHeader
 	if w.lastHeader == 0 {
 		estBits += guessHeaderSizeBits
 	}
 	estBits += estBits >> w.logNewTablePenalty
 
 	// Store bytes, if we don't get a reasonable improvement.
-	ssize, storable := w.storedSize(input)
-	if storable && ssize < estBits {
+	if storable && ssize <= estBits {
+		if debugDeflate {
+			fmt.Println("stored,", ssize, "<=", estBits)
+		}
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
 	}
 
 	if w.lastHeader > 0 {
-		reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256])
-		estBits += estExtra
+		reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256])
 
 		if estBits < reuseSize {
+			if debugDeflate {
+				fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes")
+			}
 			// We owe an EOB
 			w.writeCode(w.literalEncoding.codes[endBlockMarker])
 			w.lastHeader = 0
+		} else if debugDeflate {
+			fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
 		}
 	}
 
-	const numLiterals = endBlockMarker + 1
-	const numOffsets = 1
+	count := 0
 	if w.lastHeader == 0 {
-		w.literalFreq[endBlockMarker] = 1
-		w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)
-
+		// Use the temp encoding, so swap.
+		w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
@@ -872,39 +1098,93 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
 		w.lastHuffMan = true
 		w.lastHeader, _ = w.headerSize()
+		if debugDeflate {
+			count += w.lastHeader
+			fmt.Println("header:", count/8)
+		}
+	}
+
+	encoding := w.literalEncoding.codes[:256]
+	// Go 1.16 LOVES having these on stack. At least 1.5x the speed.
+	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
+
+	if debugDeflate {
+		count -= int(nbytes)*8 + int(nbits)
+	}
+	// Unroll, write 3 codes/loop.
+	// Fastest number of unrolls.
+	for len(input) > 3 {
+		// We must have at least 48 bits free.
+		if nbits >= 8 {
+			n := nbits >> 3
+			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			bits >>= (n * 8) & 63
+			nbits -= n * 8
+			nbytes += n
+		}
+		if nbytes >= bufferFlushSize {
+			if w.err != nil {
+				nbytes = 0
+				return
+			}
+			if debugDeflate {
+				count += int(nbytes) * 8
+			}
+			_, w.err = w.writer.Write(w.bytes[:nbytes])
+			nbytes = 0
+		}
+		a, b := encoding[input[0]], encoding[input[1]]
+		bits |= uint64(a.code) << (nbits & 63)
+		bits |= uint64(b.code) << ((nbits + a.len) & 63)
+		c := encoding[input[2]]
+		nbits += b.len + a.len
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		input = input[3:]
 	}
 
-	encoding := w.literalEncoding.codes[:257]
+	// Remaining...
 	for _, t := range input {
-		// Bitwriting inlined, ~30% speedup
-		c := encoding[t]
-		w.bits |= uint64(c.code) << ((w.nbits) & 63)
-		w.nbits += c.len
-		if w.nbits >= 48 {
-			bits := w.bits
-			w.bits >>= 48
-			w.nbits -= 48
-			n := w.nbytes
-			w.bytes[n] = byte(bits)
-			w.bytes[n+1] = byte(bits >> 8)
-			w.bytes[n+2] = byte(bits >> 16)
-			w.bytes[n+3] = byte(bits >> 24)
-			w.bytes[n+4] = byte(bits >> 32)
-			w.bytes[n+5] = byte(bits >> 40)
-			n += 6
-			if n >= bufferFlushSize {
+		if nbits >= 48 {
+			binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
+			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+			bits >>= 48
+			nbits -= 48
+			nbytes += 6
+			if nbytes >= bufferFlushSize {
 				if w.err != nil {
-					n = 0
+					nbytes = 0
 					return
 				}
-				w.write(w.bytes[:n])
-				n = 0
+				if debugDeflate {
+					count += int(nbytes) * 8
+				}
+				_, w.err = w.writer.Write(w.bytes[:nbytes])
+				nbytes = 0
 			}
-			w.nbytes = n
+		}
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		bits |= uint64(c.code) << (nbits & 63)
+		nbits += c.len
+		if debugDeflate {
+			count += int(c.len)
 		}
 	}
+	// Restore...
+	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
+
+	if debugDeflate {
+		nb := count + int(nbytes)*8 + int(nbits)
+		fmt.Println("wrote", nb, "bits,", nb/8, "bytes.")
+	}
+	// Flush if needed to have space.
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+
 	if eof || sync {
-		w.writeCode(encoding[endBlockMarker])
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
index 4c39a30..9ab497c 100644
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -17,13 +17,18 @@ const (
 
 // hcode is a huffman code with a bit code and bit length.
 type hcode struct {
-	code, len uint16
+	code uint16
+	len  uint8
 }
 
 type huffmanEncoder struct {
-	codes     []hcode
-	freqcache []literalNode
-	bitCount  [17]int32
+	codes    []hcode
+	bitCount [17]int32
+
+	// Allocate a reusable buffer with the longest possible frequency table.
+	// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+	// The largest of these is literalCount, so we allocate for that case.
+	freqcache [literalCount + 1]literalNode
 }
 
 type literalNode struct {
@@ -52,7 +57,7 @@ type levelInfo struct {
 }
 
 // set sets the code and length of an hcode.
-func (h *hcode) set(code uint16, length uint16) {
+func (h *hcode) set(code uint16, length uint8) {
 	h.len = length
 	h.code = code
 }
@@ -76,7 +81,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 	var ch uint16
 	for ch = 0; ch < literalCount; ch++ {
 		var bits uint16
-		var size uint16
+		var size uint8
 		switch {
 		case ch < 144:
 			// size 8, 000110000  .. 10111111
@@ -95,7 +100,7 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 			bits = ch + 192 - 280
 			size = 8
 		}
-		codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size}
+		codes[ch] = hcode{code: reverseBits(bits, size), len: size}
 	}
 	return h
 }
@@ -122,6 +127,29 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	return total
 }
 
+func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
+	var total int
+	for _, f := range b {
+		total += int(h.codes[f].len)
+	}
+	return total
+}
+
+// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused.
+func (h *huffmanEncoder) canReuseBits(freq []uint16) int {
+	var total int
+	for i, f := range freq {
+		if f != 0 {
+			code := h.codes[i]
+			if code.len == 0 {
+				return math.MaxInt32
+			}
+			total += int(f) * int(code.len)
+		}
+	}
+	return total
+}
+
 // Return the number of literals assigned to each bit size in the Huffman encoding
 //
 // This method is only called when list.length >= 3
@@ -160,14 +188,19 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// of the level j ancestor.
 	var leafCounts [maxBitsLimit][maxBitsLimit]int32
 
+	// Descending to only have 1 bounds check.
+	l2f := int32(list[2].freq)
+	l1f := int32(list[1].freq)
+	l0f := int32(list[0].freq) + int32(list[1].freq)
+
 	for level := int32(1); level <= maxBits; level++ {
 		// For every level, the first two items are the first two characters.
 		// We initialize the levels as if we had already figured this out.
 		levels[level] = levelInfo{
 			level:        level,
-			lastFreq:     int32(list[1].freq),
-			nextCharFreq: int32(list[2].freq),
-			nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
+			lastFreq:     l1f,
+			nextCharFreq: l2f,
+			nextPairFreq: l0f,
 		}
 		leafCounts[level][level] = 2
 		if level == 1 {
@@ -178,8 +211,8 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 	// We need a total of 2*n - 2 items at top level and have already generated 2.
 	levels[maxBits].needed = 2*n - 4
 
-	level := maxBits
-	for {
+	level := uint32(maxBits)
+	for level < 16 {
 		l := &levels[level]
 		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
 			// We've run out of both leafs and pairs.
@@ -211,7 +244,13 @@ func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
 			// more values in the level below
 			l.lastFreq = l.nextPairFreq
 			// Take leaf counts from the lower level, except counts[level] remains the same.
-			copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			if true {
+				save := leafCounts[level][level]
+				leafCounts[level] = leafCounts[level-1]
+				leafCounts[level][level] = save
+			} else {
+				copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			}
 			levels[l.level-1].needed = 2
 		}
 
@@ -269,7 +308,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 
 		sortByLiteral(chunk)
 		for _, node := range chunk {
-			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint8(n)}
 			code++
 		}
 		list = list[0 : len(list)-int(bits)]
@@ -281,13 +320,8 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 // freq  An array of frequencies, in which frequency[i] gives the frequency of literal i.
 // maxBits  The maximum number of bits to use for any literal.
 func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
-	if h.freqcache == nil {
-		// Allocate a reusable buffer with the longest possible frequency table.
-		// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
-		// The largest of these is literalCount, so we allocate for that case.
-		h.freqcache = make([]literalNode, literalCount+1)
-	}
 	list := h.freqcache[:len(freq)+1]
+	codes := h.codes[:len(freq)]
 	// Number of non-zero literals
 	count := 0
 	// Set list to be the set of all non-zero literals and their frequencies
@@ -296,11 +330,10 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 			list[count] = literalNode{uint16(i), f}
 			count++
 		} else {
-			list[count] = literalNode{}
-			h.codes[i].len = 0
+			codes[i].len = 0
 		}
 	}
-	list[len(freq)] = literalNode{}
+	list[count] = literalNode{}
 
 	list = list[:count]
 	if count <= 2 {
@@ -320,44 +353,32 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 	h.assignEncodingAndSize(bitCount, list)
 }
 
+// atLeastOne clamps the result between 1 and 15.
 func atLeastOne(v float32) float32 {
 	if v < 1 {
 		return 1
 	}
+	if v > 15 {
+		return 15
+	}
 	return v
 }
 
-// histogramSize accumulates a histogram of b in h.
-// An estimated size in bits is returned.
 // Unassigned values are assigned '1' in the histogram.
-// len(h) must be >= 256, and h's elements must be all zeroes.
-func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
+func fillHist(b []uint16) {
+	for i, v := range b {
+		if v == 0 {
+			b[i] = 1
+		}
+	}
+}
+
+func histogram(b []byte, h []uint16, fill bool) {
 	h = h[:256]
 	for _, t := range b {
 		h[t]++
 	}
-	invTotal := 1.0 / float32(len(b))
-	shannon := float32(0.0)
-	var extra float32
 	if fill {
-		oneBits := atLeastOne(-mFastLog2(invTotal))
-		for i, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			} else {
-				h[i] = 1
-				extra += oneBits
-			}
-		}
-	} else {
-		for _, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			}
-		}
+		fillHist(h)
 	}
-
-	return int(shannon + 0.99), int(extra + 0.99)
 }
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
index 3e4259f..414c0be 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -9,10 +9,10 @@ package flate
 
 import (
 	"bufio"
+	"compress/flate"
 	"fmt"
 	"io"
 	"math/bits"
-	"strconv"
 	"sync"
 )
 
@@ -29,16 +29,26 @@ const (
 	debugDecode = false
 )
 
+// Value of length - 3 and extra bits.
+type lengthExtra struct {
+	length, extra uint8
+}
+
+var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
+
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // up to 32 bits
+
 // Initialize the fixedHuffmanDecoder only once upon first use.
 var fixedOnce sync.Once
 var fixedHuffmanDecoder huffmanDecoder
 
 // A CorruptInputError reports the presence of corrupt input at a given offset.
-type CorruptInputError int64
-
-func (e CorruptInputError) Error() string {
-	return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
-}
+type CorruptInputError = flate.CorruptInputError
 
 // An InternalError reports an error in the flate code itself.
 type InternalError string
@@ -48,26 +58,12 @@ func (e InternalError) Error() string { return "flate: internal error: " + strin
 // A ReadError reports an error encountered while reading input.
 //
 // Deprecated: No longer returned.
-type ReadError struct {
-	Offset int64 // byte offset where error occurred
-	Err    error // error returned by underlying Read
-}
-
-func (e *ReadError) Error() string {
-	return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
-}
+type ReadError = flate.ReadError
 
 // A WriteError reports an error encountered while writing output.
 //
 // Deprecated: No longer returned.
-type WriteError struct {
-	Offset int64 // byte offset where error occurred
-	Err    error // error returned by underlying Write
-}
-
-func (e *WriteError) Error() string {
-	return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
-}
+type WriteError = flate.WriteError
 
 // Resetter resets a ReadCloser returned by NewReader or NewReaderDict to
 // to switch to a new underlying Reader. This permits reusing a ReadCloser
@@ -339,11 +335,17 @@ func (f *decompressor) nextBlock() {
 	switch typ {
 	case 0:
 		f.dataBlock()
+		if debugDecode {
+			fmt.Println("stored block")
+		}
 	case 1:
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("predefinied huffman block")
+		}
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@@ -352,6 +354,9 @@ func (f *decompressor) nextBlock() {
 		f.hl = &f.h1
 		f.hd = &f.h2
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("dynamic huffman block")
+		}
 	default:
 		// 3 is reserved.
 		if debugDecode {
@@ -522,8 +527,8 @@ func (f *decompressor) readHuffman() error {
 				return err
 			}
 		}
-		rep += int(f.b & uint32(1<<nb-1))
-		f.b >>= nb
+		rep += int(f.b & uint32(1<<(nb&regSizeMaskUint32)-1))
+		f.b >>= nb & regSizeMaskUint32
 		f.nb -= nb
 		if i+rep > n {
 			if debugDecode {
@@ -561,219 +566,6 @@ func (f *decompressor) readHuffman() error {
 	return nil
 }
 
-// Decode a single Huffman block from f.
-// hl and hd are the Huffman states for the lit/length values
-// and the distance values, respectively. If hd == nil, using the
-// fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlockGeneric() {
-	const (
-		stateInit = iota // Zero value must be stateInit
-		stateDict
-	)
-
-	switch f.stepState {
-	case stateInit:
-		goto readLiteral
-	case stateDict:
-		goto copyHistory
-	}
-
-readLiteral:
-	// Read literal and/or (length, distance) according to RFC section 3.2.3.
-	{
-		var v int
-		{
-			// Inlined v, err := f.huffSym(f.hl)
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := f.r.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						if debugDecode {
-							fmt.Println("huffsym: n==0")
-						}
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var n uint // number of bits extra
-		var length int
-		var err error
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlockGeneric
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
-		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits n>0:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
-		if f.hd == nil {
-			for f.nb < 5 {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<5:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
-				}
-				f.err = err
-				return
-			}
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = f.moreBits(); err != nil {
-					if debugDecode {
-						fmt.Println("morebits f.nb<nb:", err)
-					}
-					f.err = err
-					return
-				}
-			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
-		default:
-			if debugDecode {
-				fmt.Println("dist too big:", dist, maxNumDist)
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
-			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, dist
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
 // Copy a single uncompressed data block from input to output.
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
@@ -869,7 +661,7 @@ func (f *decompressor) moreBits() error {
 		return noEOF(err)
 	}
 	f.roffset++
-	f.b |= uint32(c) << f.nb
+	f.b |= uint32(c) << (f.nb & regSizeMaskUint32)
 	f.nb += 8
 	return nil
 }
@@ -894,7 +686,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
 				return 0, noEOF(err)
 			}
 			f.roffset++
-			b |= uint32(c) << (nb & 31)
+			b |= uint32(c) << (nb & regSizeMaskUint32)
 			nb += 8
 		}
 		chunk := h.chunks[b&(huffmanNumChunks-1)]
@@ -913,7 +705,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
 				f.err = CorruptInputError(f.roffset)
 				return 0, f.err
 			}
-			f.b = b >> (n & 31)
+			f.b = b >> (n & regSizeMaskUint32)
 			f.nb = nb - n
 			return int(chunk >> huffmanValueShift), nil
 		}
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
index 397dc1b..61342b6 100644
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -20,16 +20,11 @@ func (f *decompressor) huffmanBytesBuffer() {
 		stateDict
 	)
 	fr := f.r.(*bytes.Buffer)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -49,128 +44,150 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
 
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesBuffer
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
 			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
 		}
 
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
 				}
-				f.err = err
-				return
 			}
 		}
 
@@ -180,21 +197,28 @@ readLiteral:
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -203,35 +227,38 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
 
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
 
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -244,16 +271,11 @@ func (f *decompressor) huffmanBytesReader() {
 		stateDict
 	)
 	fr := f.r.(*bytes.Reader)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -273,128 +295,150 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
 
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
 			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
 		}
 
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
 				}
-				f.err = err
-				return
 			}
 		}
 
@@ -404,21 +448,28 @@ readLiteral:
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -427,35 +478,38 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
 
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
 
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -468,16 +522,11 @@ func (f *decompressor) huffmanBufioReader() {
 		stateDict
 	)
 	fr := f.r.(*bufio.Reader)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -497,128 +546,150 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
 
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBufioReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			if debugDecode {
-				fmt.Println(v, ">= maxNumLit")
-			}
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
 		}
 
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
 				}
-				f.err = err
-				return
 			}
 		}
 
@@ -628,21 +699,28 @@ readLiteral:
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -651,35 +729,38 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
 
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
 
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // Decode a single Huffman block from f.
@@ -692,16 +773,11 @@ func (f *decompressor) huffmanStringsReader() {
 		stateDict
 	)
 	fr := f.r.(*strings.Reader)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 
 	switch f.stepState {
 	case stateInit:
@@ -721,128 +797,401 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						if debugDecode {
 							fmt.Println("huffsym: n==0")
 						}
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
 
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanStringsReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
 		default:
 			if debugDecode {
 				fmt.Println(v, ">= maxNumLit")
 			}
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanGenericReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = (*decompressor).huffmanGenericReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits n>0:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
 			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
 		}
 
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<5:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				if debugDecode {
-					fmt.Println("huffsym:", err)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
 				}
-				f.err = err
-				return
 			}
 		}
 
@@ -852,21 +1201,28 @@ readLiteral:
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					if debugDecode {
 						fmt.Println("morebits f.nb<nb:", err)
 					}
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			if debugDecode {
 				fmt.Println("dist too big:", dist, maxNumDist)
 			}
@@ -875,35 +1231,38 @@ readLiteral:
 		}
 
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			if debugDecode {
-				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
 			}
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
 
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
 
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
 
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 func (f *decompressor) huffmanBlockDecoder() func() {
@@ -916,7 +1275,9 @@ func (f *decompressor) huffmanBlockDecoder() func() {
 		return f.huffmanBufioReader
 	case *strings.Reader:
 		return f.huffmanStringsReader
+	case Reader:
+		return f.huffmanGenericReader
 	default:
-		return f.huffmanBlockGeneric
+		return f.huffmanGenericReader
 	}
 }
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
index 1e5eea3..0f14f8d 100644
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -1,6 +1,10 @@
 package flate
 
-import "fmt"
+import (
+	"encoding/binary"
+	"fmt"
+	"math/bits"
+)
 
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
@@ -116,7 +120,32 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 
 			// Extend the 4-byte match as long as possible.
 			t := candidate.offset - e.cur
-			l := e.matchlenLong(s+4, t+4, src) + 4
+			var l = int32(4)
+			if false {
+				l = e.matchlenLong(s+4, t+4, src) + 4
+			} else {
+				// inlined:
+				a := src[s+4:]
+				b := src[t+4:]
+				for len(a) >= 8 {
+					if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
+						l += int32(bits.TrailingZeros64(diff) >> 3)
+						break
+					}
+					l += 8
+					a = a[8:]
+					b = b[8:]
+				}
+				if len(a) < 8 {
+					b = b[:len(a)]
+					for i := range a {
+						if a[i] != b[i] {
+							break
+						}
+						l++
+					}
+				}
+			}
 
 			// Extend backwards
 			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
@@ -125,11 +154,43 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
-			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						if xl > 258+baseMatchLength {
+							xl = 258
+						} else {
+							xl = 258 - baseMatchLength
+						}
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
 			s += l
 			nextEmit = s
 			if nextS >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
index 5b986a1..8603fbd 100644
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -134,7 +134,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
@@ -155,7 +163,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 
 			// Store every second hash in-between, but offset by 1.
 			for i := s - l + 2; i < s-5; i += 7 {
-				x := load6432(src, int32(i))
+				x := load6432(src, i)
 				nextHash := hash4u(uint32(x), bTableBits)
 				e.table[nextHash] = tableEntry{offset: e.cur + i}
 				// Skip one
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
index c22b424..039639f 100644
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -5,7 +5,7 @@ import "fmt"
 // fastEncL3
 type fastEncL3 struct {
 	fastGen
-	table [tableSize]tableEntryPrev
+	table [1 << 16]tableEntryPrev
 }
 
 // Encode uses a similar algorithm to level 2, will check up to two candidates.
@@ -13,6 +13,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 	const (
 		inputMargin            = 8 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		tableBits              = 16
+		tableSize              = 1 << tableBits
 	)
 
 	if debugDeflate && e.cur < 0 {
@@ -73,7 +75,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		nextS := s
 		var candidate tableEntry
 		for {
-			nextHash := hash(cv)
+			nextHash := hash4u(cv, tableBits)
 			s = nextS
 			nextS = s + 1 + (s-nextEmit)>>skipLog
 			if nextS > sLimit {
@@ -141,7 +143,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
@@ -156,7 +166,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				// Index first pair after match end.
 				if int(t+4) < len(src) && t > 0 {
 					cv := load3232(src, t)
-					nextHash := hash(cv)
+					nextHash := hash4u(cv, tableBits)
 					e.table[nextHash] = tableEntryPrev{
 						Prev: e.table[nextHash].Cur,
 						Cur:  tableEntry{offset: e.cur + t},
@@ -165,30 +175,31 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				goto emitRemainder
 			}
 
-			// We could immediately start working at s now, but to improve
-			// compression we first update the hash table at s-3 to s.
-			x := load6432(src, s-3)
-			prevHash := hash(uint32(x))
-			e.table[prevHash] = tableEntryPrev{
-				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 3},
+			// Store every 5th hash in-between.
+			for i := s - l + 2; i < s-5; i += 5 {
+				nextHash := hash4u(load3232(src, i), tableBits)
+				e.table[nextHash] = tableEntryPrev{
+					Prev: e.table[nextHash].Cur,
+					Cur:  tableEntry{offset: e.cur + i}}
 			}
-			x >>= 8
-			prevHash = hash(uint32(x))
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s.
+			x := load6432(src, s-2)
+			prevHash := hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 2},
 			}
 			x >>= 8
-			prevHash = hash(uint32(x))
+			prevHash = hash4u(uint32(x), tableBits)
 
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
 				Cur:  tableEntry{offset: e.cur + s - 1},
 			}
 			x >>= 8
-			currHash := hash(uint32(x))
+			currHash := hash4u(uint32(x), tableBits)
 			candidates := e.table[currHash]
 			cv = uint32(x)
 			e.table[currHash] = tableEntryPrev{
@@ -200,15 +211,15 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			candidate = candidates.Cur
 			minOffset := e.cur + s - (maxMatchOffset - 4)
 
-			if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) {
-				// We only check if value mismatches.
-				// Offset will always be invalid in other cases.
+			if candidate.offset > minOffset {
+				if cv == load3232(src, candidate.offset-e.cur) {
+					// Found a match...
+					continue
+				}
 				candidate = candidates.Prev
 				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
-					offset := s - (candidate.offset - e.cur)
-					if offset <= maxMatchOffset {
-						continue
-					}
+					// Match at prev...
+					continue
 				}
 			}
 			cv = uint32(x >> 8)
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
index e62f0c0..1cbffa1 100644
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -135,7 +135,15 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
index d513f1f..4b97576 100644
--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -182,12 +182,27 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
 		// them as literal bytes.
 
-		// Extend the 4-byte match as long as possible.
 		if l == 0 {
+			// Extend the 4-byte match as long as possible.
 			l = e.matchlenLong(s+4, t+4, src) + 4
 		} else if l == maxMatchLength {
 			l += e.matchlenLong(s+l, t+l, src)
 		}
+
+		// Try to locate a better match by checking the end of best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			// Test current
+			t2 := eLong - e.cur - l
+			off := s - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(s, t2, src); l2 > l {
+					t = t2
+					l = l2
+				}
+			}
+		}
+
 		// Extend backwards
 		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
 			s--
@@ -195,7 +210,15 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if debugDeflate {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
index a52c80e..62888ed 100644
--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -211,6 +211,31 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			l += e.matchlenLong(s+l, t+l, src)
 		}
 
+		// Try to locate a better match by checking the end-of-match...
+		if sAt := s + l; sAt < sLimit {
+			eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)]
+			// Test current
+			t2 := eLong.Cur.offset - e.cur - l
+			off := s - t2
+			if off < maxMatchOffset {
+				if off > 0 && t2 >= 0 {
+					if l2 := e.matchlenLong(s, t2, src); l2 > l {
+						t = t2
+						l = l2
+					}
+				}
+				// Test next:
+				t2 = eLong.Prev.offset - e.cur - l
+				off := s - t2
+				if off > 0 && off < maxMatchOffset && t2 >= 0 {
+					if l2 := e.matchlenLong(s, t2, src); l2 > l {
+						t = t2
+						l = l2
+					}
+				}
+			}
+		}
+
 		// Extend backwards
 		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
 			s--
@@ -218,7 +243,15 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			l++
 		}
 		if nextEmit < s {
-			emitLiteral(dst, src[nextEmit:s])
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
 		}
 		if false {
 			if t >= s {
diff --git a/vendor/github.com/klauspost/compress/flate/regmask_amd64.go b/vendor/github.com/klauspost/compress/flate/regmask_amd64.go
new file mode 100644
index 0000000..6ed2806
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/regmask_amd64.go
@@ -0,0 +1,37 @@
+package flate
+
+const (
+	// Masks for shifts with register sizes of the shift value.
+	// This can be used to work around the x86 design of shifting by mod register size.
+	// It can be used when a variable shift is always smaller than the register size.
+
+	// reg8SizeMaskX - shift value is 8 bits, shifted is X
+	reg8SizeMask8  = 7
+	reg8SizeMask16 = 15
+	reg8SizeMask32 = 31
+	reg8SizeMask64 = 63
+
+	// reg16SizeMaskX - shift value is 16 bits, shifted is X
+	reg16SizeMask8  = reg8SizeMask8
+	reg16SizeMask16 = reg8SizeMask16
+	reg16SizeMask32 = reg8SizeMask32
+	reg16SizeMask64 = reg8SizeMask64
+
+	// reg32SizeMaskX - shift value is 32 bits, shifted is X
+	reg32SizeMask8  = reg8SizeMask8
+	reg32SizeMask16 = reg8SizeMask16
+	reg32SizeMask32 = reg8SizeMask32
+	reg32SizeMask64 = reg8SizeMask64
+
+	// reg64SizeMaskX - shift value is 64 bits, shifted is X
+	reg64SizeMask8  = reg8SizeMask8
+	reg64SizeMask16 = reg8SizeMask16
+	reg64SizeMask32 = reg8SizeMask32
+	reg64SizeMask64 = reg8SizeMask64
+
+	// regSizeMaskUintX - shift value is uint, shifted is X
+	regSizeMaskUint8  = reg8SizeMask8
+	regSizeMaskUint16 = reg8SizeMask16
+	regSizeMaskUint32 = reg8SizeMask32
+	regSizeMaskUint64 = reg8SizeMask64
+)
diff --git a/vendor/github.com/klauspost/compress/flate/regmask_other.go b/vendor/github.com/klauspost/compress/flate/regmask_other.go
new file mode 100644
index 0000000..1b7a2cb
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/regmask_other.go
@@ -0,0 +1,40 @@
+//go:build !amd64
+// +build !amd64
+
+package flate
+
+const (
+	// Masks for shifts with register sizes of the shift value.
+	// This can be used to work around the x86 design of shifting by mod register size.
+	// It can be used when a variable shift is always smaller than the register size.
+
+	// reg8SizeMaskX - shift value is 8 bits, shifted is X
+	reg8SizeMask8  = 0xff
+	reg8SizeMask16 = 0xff
+	reg8SizeMask32 = 0xff
+	reg8SizeMask64 = 0xff
+
+	// reg16SizeMaskX - shift value is 16 bits, shifted is X
+	reg16SizeMask8  = 0xffff
+	reg16SizeMask16 = 0xffff
+	reg16SizeMask32 = 0xffff
+	reg16SizeMask64 = 0xffff
+
+	// reg32SizeMaskX - shift value is 32 bits, shifted is X
+	reg32SizeMask8  = 0xffffffff
+	reg32SizeMask16 = 0xffffffff
+	reg32SizeMask32 = 0xffffffff
+	reg32SizeMask64 = 0xffffffff
+
+	// reg64SizeMaskX - shift value is 64 bits, shifted is X
+	reg64SizeMask8  = 0xffffffffffffffff
+	reg64SizeMask16 = 0xffffffffffffffff
+	reg64SizeMask32 = 0xffffffffffffffff
+	reg64SizeMask64 = 0xffffffffffffffff
+
+	// regSizeMaskUintX - shift value is uint, shifted is X
+	regSizeMaskUint8  = ^uint(0)
+	regSizeMaskUint16 = ^uint(0)
+	regSizeMaskUint32 = ^uint(0)
+	regSizeMaskUint64 = ^uint(0)
+)
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
index 53e8991..544162a 100644
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -249,7 +249,15 @@ func statelessEnc(dst *tokens, src []byte, startAt int16) {
 				l++
 			}
 			if nextEmit < s {
-				emitLiteral(dst, src[nextEmit:s])
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
 			}
 
 			// Save the match found
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
index f9abf60..d818790 100644
--- a/vendor/github.com/klauspost/compress/flate/token.go
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -13,14 +13,16 @@ import (
 )
 
 const (
-	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
-	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
-	// 22 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
-	lengthShift = 22
-	offsetMask  = 1<<lengthShift - 1
-	typeMask    = 3 << 30
-	literalType = 0 << 30
-	matchType   = 1 << 30
+	// bits 0-16  	xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
+	// bits 16-22	offsetcode - 5 bits
+	// bits 22-30   xlength = length - MIN_MATCH_LENGTH - 8 bits
+	// bits 30-32   type   0 = literal  1=EOF  2=Match   3=Unused - 2 bits
+	lengthShift         = 22
+	offsetMask          = 1<<lengthShift - 1
+	typeMask            = 3 << 30
+	literalType         = 0 << 30
+	matchType           = 1 << 30
+	matchOffsetOnlyMask = 0xffff
 )
 
 // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
@@ -126,11 +128,11 @@ var offsetCodes14 = [256]uint32{
 type token uint32
 
 type tokens struct {
-	nLits     int
 	extraHist [32]uint16  // codes 256->maxnumlit
 	offHist   [32]uint16  // offset codes
 	litHist   [256]uint16 // codes 0->255
-	n         uint16      // Must be able to contain maxStoreBlockSize
+	nFilled   int
+	n         uint16 // Must be able to contain maxStoreBlockSize
 	tokens    [maxStoreBlockSize + 1]token
 }
 
@@ -139,7 +141,7 @@ func (t *tokens) Reset() {
 		return
 	}
 	t.n = 0
-	t.nLits = 0
+	t.nFilled = 0
 	for i := range t.litHist[:] {
 		t.litHist[i] = 0
 	}
@@ -158,12 +160,12 @@ func (t *tokens) Fill() {
 	for i, v := range t.litHist[:] {
 		if v == 0 {
 			t.litHist[i] = 1
-			t.nLits++
+			t.nFilled++
 		}
 	}
 	for i, v := range t.extraHist[:literalCount-256] {
 		if v == 0 {
-			t.nLits++
+			t.nFilled++
 			t.extraHist[i] = 1
 		}
 	}
@@ -187,26 +189,23 @@ func (t *tokens) indexTokens(in []token) {
 			t.AddLiteral(tok.literal())
 			continue
 		}
-		t.AddMatch(uint32(tok.length()), tok.offset())
+		t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask)
 	}
 }
 
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 func emitLiteral(dst *tokens, lit []byte) {
-	ol := int(dst.n)
-	for i, v := range lit {
-		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+	for _, v := range lit {
+		dst.tokens[dst.n] = token(v)
 		dst.litHist[v]++
+		dst.n++
 	}
-	dst.n += uint16(len(lit))
-	dst.nLits += len(lit)
 }
 
 func (t *tokens) AddLiteral(lit byte) {
 	t.tokens[t.n] = token(lit)
 	t.litHist[lit]++
 	t.n++
-	t.nLits++
 }
 
 // from https://stackoverflow.com/a/28730362
@@ -227,12 +226,13 @@ func (t *tokens) EstimatedBits() int {
 	shannon := float32(0)
 	bits := int(0)
 	nMatches := 0
-	if t.nLits > 0 {
-		invTotal := 1.0 / float32(t.nLits)
+	total := int(t.n) + t.nFilled
+	if total > 0 {
+		invTotal := 1.0 / float32(total)
 		for _, v := range t.litHist[:] {
 			if v > 0 {
 				n := float32(v)
-				shannon += -mFastLog2(n*invTotal) * n
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 			}
 		}
 		// Just add 15 for EOB
@@ -240,7 +240,7 @@ func (t *tokens) EstimatedBits() int {
 		for i, v := range t.extraHist[1 : literalCount-256] {
 			if v > 0 {
 				n := float32(v)
-				shannon += -mFastLog2(n*invTotal) * n
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 				bits += int(lengthExtraBits[i&31]) * int(v)
 				nMatches += int(v)
 			}
@@ -251,7 +251,7 @@ func (t *tokens) EstimatedBits() int {
 		for i, v := range t.offHist[:offsetCodeCount] {
 			if v > 0 {
 				n := float32(v)
-				shannon += -mFastLog2(n*invTotal) * n
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 				bits += int(offsetExtraBits[i&31]) * int(v)
 			}
 		}
@@ -270,11 +270,12 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 			panic(fmt.Errorf("invalid offset: %v", xoffset))
 		}
 	}
-	t.nLits++
-	lengthCode := lengthCodes1[uint8(xlength)] & 31
+	oCode := offsetCode(xoffset)
+	xoffset |= oCode << 16
+
+	t.extraHist[lengthCodes1[uint8(xlength)]]++
+	t.offHist[oCode&31]++
 	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
-	t.extraHist[lengthCode]++
-	t.offHist[offsetCode(xoffset)&31]++
 	t.n++
 }
 
@@ -286,20 +287,23 @@ func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
 			panic(fmt.Errorf("invalid offset: %v", xoffset))
 		}
 	}
-	oc := offsetCode(xoffset) & 31
+	oc := offsetCode(xoffset)
+	xoffset |= oc << 16
 	for xlength > 0 {
 		xl := xlength
 		if xl > 258 {
 			// We need to have at least baseMatchLength left over for next loop.
-			xl = 258 - baseMatchLength
+			if xl > 258+baseMatchLength {
+				xl = 258
+			} else {
+				xl = 258 - baseMatchLength
+			}
 		}
 		xlength -= xl
-		xl -= 3
-		t.nLits++
-		lengthCode := lengthCodes1[uint8(xl)] & 31
+		xl -= baseMatchLength
+		t.extraHist[lengthCodes1[uint8(xl)]]++
+		t.offHist[oc&31]++
 		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
-		t.extraHist[lengthCode]++
-		t.offHist[oc]++
 		t.n++
 	}
 }
@@ -354,8 +358,8 @@ func (t token) offset() uint32 { return uint32(t) & offsetMask }
 
 func (t token) length() uint8 { return uint8(t >> lengthShift) }
 
-// The code is never more than 8 bits, but is returned as uint32 for convenience.
-func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
+// Convert length to code.
+func lengthCode(len uint8) uint8 { return lengthCodes[len] }
 
 // Returns the offset code corresponding to a specific offset
 func offsetCode(off uint32) uint32 {
diff --git a/vendor/github.com/klauspost/compress/fse/README.md b/vendor/github.com/klauspost/compress/fse/README.md
index 27d8ed5..ea7324d 100644
--- a/vendor/github.com/klauspost/compress/fse/README.md
+++ b/vendor/github.com/klauspost/compress/fse/README.md
@@ -1,79 +1,79 @@
-# Finite State Entropy
-
-This package provides Finite State Entropy encoding and decoding.
-            
-Finite State Entropy (also referenced as [tANS](https://en.wikipedia.org/wiki/Asymmetric_numeral_systems#tANS)) 
-encoding provides a fast near-optimal symbol encoding/decoding
-for byte blocks as implemented in [zstandard](https://github.com/facebook/zstd).
-
-This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
-This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
-but it can be used as a secondary step to compressors (like Snappy) that does not do entropy encoding. 
-
-* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/fse)
-
-## News
-
- * Feb 2018: First implementation released. Consider this beta software for now.
-
-# Usage
-
-This package provides a low level interface that allows to compress single independent blocks. 
-
-Each block is separate, and there is no built in integrity checks. 
-This means that the caller should keep track of block sizes and also do checksums if needed.  
-
-Compressing a block is done via the [`Compress`](https://godoc.org/github.com/klauspost/compress/fse#Compress) function.
-You must provide input and will receive the output and maybe an error.
-
-These error values can be returned:
-
-| Error               | Description                                                                 |
-|---------------------|-----------------------------------------------------------------------------|
-| `<nil>`             | Everything ok, output is returned                                           |
-| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
-| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
-| `(error)`           | An internal error occurred.                                                 |
-
-As can be seen above there are errors that will be returned even under normal operation so it is important to handle these.
-
-To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/fse#Scratch) object 
-that can be re-used for successive calls. Both compression and decompression accepts a `Scratch` object, and the same 
-object can be used for both.   
-
-Be aware, that when re-using a `Scratch` object that the *output* buffer is also re-used, so if you are still using this
-you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
-
-Decompressing is done by calling the [`Decompress`](https://godoc.org/github.com/klauspost/compress/fse#Decompress) function.
-You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
-your input was likely corrupted. 
-
-It is important to note that a successful decoding does *not* mean your output matches your original input. 
-There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
-
-For more detailed usage, see examples in the [godoc documentation](https://godoc.org/github.com/klauspost/compress/fse#pkg-examples).
-
-# Performance
-
-A lot of factors are affecting speed. Block sizes and compressibility of the material are primary factors.  
-All compression functions are currently only running on the calling goroutine so only one core will be used per block.  
-
-The compressor is significantly faster if symbols are kept as small as possible. The highest byte value of the input
-is used to reduce some of the processing, so if all your input is above byte value 64 for instance, it may be 
-beneficial to transpose all your input values down by 64.   
-
-With moderate block sizes around 64k speed are typically 200MB/s per core for compression and 
-around 300MB/s decompression speed. 
-
-The same hardware typically does Huffman (deflate) encoding at 125MB/s and decompression at 100MB/s. 
-
-# Plans
-
-At one point, more internals will be exposed to facilitate more "expert" usage of the components. 
-
-A streaming interface is also likely to be implemented. Likely compatible with [FSE stream format](https://github.com/Cyan4973/FiniteStateEntropy/blob/dev/programs/fileio.c#L261).  
-
-# Contributing
-
-Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
+# Finite State Entropy
+
+This package provides Finite State Entropy encoding and decoding.
+            
+Finite State Entropy (also referenced as [tANS](https://en.wikipedia.org/wiki/Asymmetric_numeral_systems#tANS)) 
+encoding provides a fast near-optimal symbol encoding/decoding
+for byte blocks as implemented in [zstandard](https://github.com/facebook/zstd).
+
+This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
+This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
+but it can be used as a secondary step to compressors (like Snappy) that does not do entropy encoding. 
+
+* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/fse)
+
+## News
+
+ * Feb 2018: First implementation released. Consider this beta software for now.
+
+# Usage
+
+This package provides a low level interface that allows to compress single independent blocks. 
+
+Each block is separate, and there is no built in integrity checks. 
+This means that the caller should keep track of block sizes and also do checksums if needed.  
+
+Compressing a block is done via the [`Compress`](https://godoc.org/github.com/klauspost/compress/fse#Compress) function.
+You must provide input and will receive the output and maybe an error.
+
+These error values can be returned:
+
+| Error               | Description                                                                 |
+|---------------------|-----------------------------------------------------------------------------|
+| `<nil>`             | Everything ok, output is returned                                           |
+| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
+| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
+| `(error)`           | An internal error occurred.                                                 |
+
+As can be seen above there are errors that will be returned even under normal operation so it is important to handle these.
+
+To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/fse#Scratch) object 
+that can be re-used for successive calls. Both compression and decompression accepts a `Scratch` object, and the same 
+object can be used for both.   
+
+Be aware, that when re-using a `Scratch` object that the *output* buffer is also re-used, so if you are still using this
+you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
+
+Decompressing is done by calling the [`Decompress`](https://godoc.org/github.com/klauspost/compress/fse#Decompress) function.
+You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
+your input was likely corrupted. 
+
+It is important to note that a successful decoding does *not* mean your output matches your original input. 
+There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
+
+For more detailed usage, see examples in the [godoc documentation](https://godoc.org/github.com/klauspost/compress/fse#pkg-examples).
+
+# Performance
+
+A lot of factors are affecting speed. Block sizes and compressibility of the material are primary factors.  
+All compression functions are currently only running on the calling goroutine so only one core will be used per block.  
+
+The compressor is significantly faster if symbols are kept as small as possible. The highest byte value of the input
+is used to reduce some of the processing, so if all your input is above byte value 64 for instance, it may be 
+beneficial to transpose all your input values down by 64.   
+
+With moderate block sizes around 64k speed are typically 200MB/s per core for compression and 
+around 300MB/s decompression speed. 
+
+The same hardware typically does Huffman (deflate) encoding at 125MB/s and decompression at 100MB/s. 
+
+# Plans
+
+At one point, more internals will be exposed to facilitate more "expert" usage of the components. 
+
+A streaming interface is also likely to be implemented. Likely compatible with [FSE stream format](https://github.com/Cyan4973/FiniteStateEntropy/blob/dev/programs/fileio.c#L261).  
+
+# Contributing
+
+Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
 changes will likely not be accepted. If in doubt open an issue before writing the PR.  
\ No newline at end of file
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
index b69237c..6f34191 100644
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@@ -92,7 +92,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, tableLog uint8, first symbolTra
 	im := int32((nbBitsOut << 16) - first.deltaNbBits)
 	lu := (im >> nbBitsOut) + first.deltaFindState
 	c.state = c.stateTable[lu]
-	return
 }
 
 // encode the output symbol provided and write it to the bitstream.
@@ -301,7 +300,7 @@ func (s *Scratch) writeCount() error {
 	out[outP+1] = byte(bitStream >> 8)
 	outP += (bitCount + 7) / 8
 
-	if uint16(charnum) > s.symbolLen {
+	if charnum > s.symbolLen {
 		return errors.New("internal error: charnum > s.symbolLen")
 	}
 	s.Out = out[:outP]
@@ -331,7 +330,7 @@ type cTable struct {
 func (s *Scratch) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@@ -565,8 +564,8 @@ func (s *Scratch) normalizeCount2() error {
 		distributed  uint32
 		total        = uint32(s.br.remain())
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@@ -591,7 +590,7 @@ func (s *Scratch) normalizeCount2() error {
 
 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
index 413ec3b..926f5f1 100644
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@@ -172,7 +172,7 @@ type decSymbol struct {
 // allocDtable will allocate decoding tables if they are not big enough.
 func (s *Scratch) allocDtable() {
 	tableSize := 1 << s.actualTableLog
-	if cap(s.decTable) < int(tableSize) {
+	if cap(s.decTable) < tableSize {
 		s.decTable = make([]decSymbol, tableSize)
 	}
 	s.decTable = s.decTable[:tableSize]
@@ -340,7 +340,7 @@ type decoder struct {
 func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) {
 	d.dt = dt
 	d.br = in
-	d.state = uint16(in.getBits(tableLog))
+	d.state = in.getBits(tableLog)
 }
 
 // next returns the next symbol and sets the next state.
diff --git a/vendor/github.com/klauspost/compress/gen.sh b/vendor/github.com/klauspost/compress/gen.sh
new file mode 100644
index 0000000..aff9422
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gen.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+cd s2/cmd/_s2sx/ || exit 1
+go generate .
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
index ec4f909..8b6e5c6 100644
--- a/vendor/github.com/klauspost/compress/huff0/README.md
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@@ -1,87 +1,89 @@
-# Huff0 entropy compression
-
-This package provides Huff0 encoding and decoding as used in zstd.
-            
-[Huff0](https://github.com/Cyan4973/FiniteStateEntropy#new-generation-entropy-coders), 
-a Huffman codec designed for modern CPU, featuring OoO (Out of Order) operations on multiple ALU 
-(Arithmetic Logic Unit), achieving extremely fast compression and decompression speeds.
-
-This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
-This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
-but it can be used as a secondary step to compressors (like Snappy) that does not do entropy encoding. 
-
-* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
-
-## News
-
- * Mar 2018: First implementation released. Consider this beta software for now.
-
-# Usage
-
-This package provides a low level interface that allows to compress single independent blocks. 
-
-Each block is separate, and there is no built in integrity checks. 
-This means that the caller should keep track of block sizes and also do checksums if needed.  
-
-Compressing a block is done via the [`Compress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress1X) and 
-[`Compress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress4X) functions.
-You must provide input and will receive the output and maybe an error.
-
-These error values can be returned:
-
-| Error               | Description                                                                 |
-|---------------------|-----------------------------------------------------------------------------|
-| `<nil>`             | Everything ok, output is returned                                           |
-| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
-| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
-| `ErrTooBig`         | Returned if the input block exceeds the maximum allowed size (128 Kib)      |
-| `(error)`           | An internal error occurred.                                                 |
-
-
-As can be seen above some of there are errors that will be returned even under normal operation so it is important to handle these.
-
-To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object 
-that can be re-used for successive calls. Both compression and decompression accepts a `Scratch` object, and the same 
-object can be used for both.   
-
-Be aware, that when re-using a `Scratch` object that the *output* buffer is also re-used, so if you are still using this
-you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
-
-The `Scratch` object will retain state that allows to re-use previous tables for encoding and decoding.  
-
-## Tables and re-use
-
-Huff0 allows for reusing tables from the previous block to save space if that is expected to give better/faster results. 
-
-The Scratch object allows you to set a [`ReusePolicy`](https://godoc.org/github.com/klauspost/compress/huff0#ReusePolicy) 
-that controls this behaviour. See the documentation for details. This can be altered between each block.
-
-Do however note that this information is *not* stored in the output block and it is up to the users of the package to
-record whether [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable) should be called,
-based on the boolean reported back from the CompressXX call. 
-
-If you want to store the table separate from the data, you can access them as `OutData` and `OutTable` on the 
-[`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object.
-
-## Decompressing
-
-The first part of decoding is to initialize the decoding table through [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable).
-This will initialize the decoding tables. 
-You can supply the complete block to `ReadTable` and it will return the data part of the block 
-which can be given to the decompressor. 
-
-Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
-or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
-
-For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
-
-You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
-your input was likely corrupted. 
-
-It is important to note that a successful decoding does *not* mean your output matches your original input. 
-There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
-
-# Contributing
-
-Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
-changes will likely not be accepted. If in doubt open an issue before writing the PR.
+# Huff0 entropy compression
+
+This package provides Huff0 encoding and decoding as used in zstd.
+            
+[Huff0](https://github.com/Cyan4973/FiniteStateEntropy#new-generation-entropy-coders), 
+a Huffman codec designed for modern CPU, featuring OoO (Out of Order) operations on multiple ALU 
+(Arithmetic Logic Unit), achieving extremely fast compression and decompression speeds.
+
+This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
+This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
+but it can be used as a secondary step to compressors (like Snappy) that does not do entropy encoding. 
+
+* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
+
+## News
+
+This is used as part of the [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression package.
+
+This ensures that most functionality is well tested.
+
+# Usage
+
+This package provides a low level interface that allows to compress single independent blocks. 
+
+Each block is separate, and there is no built in integrity checks. 
+This means that the caller should keep track of block sizes and also do checksums if needed.  
+
+Compressing a block is done via the [`Compress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress1X) and 
+[`Compress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress4X) functions.
+You must provide input and will receive the output and maybe an error.
+
+These error values can be returned:
+
+| Error               | Description                                                                 |
+|---------------------|-----------------------------------------------------------------------------|
+| `<nil>`             | Everything ok, output is returned                                           |
+| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
+| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
+| `ErrTooBig`         | Returned if the input block exceeds the maximum allowed size (128 Kib)      |
+| `(error)`           | An internal error occurred.                                                 |
+
+
+As can be seen above some of there are errors that will be returned even under normal operation so it is important to handle these.
+
+To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object 
+that can be re-used for successive calls. Both compression and decompression accepts a `Scratch` object, and the same 
+object can be used for both.   
+
+Be aware, that when re-using a `Scratch` object that the *output* buffer is also re-used, so if you are still using this
+you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
+
+The `Scratch` object will retain state that allows to re-use previous tables for encoding and decoding.  
+
+## Tables and re-use
+
+Huff0 allows for reusing tables from the previous block to save space if that is expected to give better/faster results. 
+
+The Scratch object allows you to set a [`ReusePolicy`](https://godoc.org/github.com/klauspost/compress/huff0#ReusePolicy) 
+that controls this behaviour. See the documentation for details. This can be altered between each block.
+
+Do however note that this information is *not* stored in the output block and it is up to the users of the package to
+record whether [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable) should be called,
+based on the boolean reported back from the CompressXX call. 
+
+If you want to store the table separate from the data, you can access them as `OutData` and `OutTable` on the 
+[`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object.
+
+## Decompressing
+
+The first part of decoding is to initialize the decoding table through [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable).
+This will initialize the decoding tables. 
+You can supply the complete block to `ReadTable` and it will return the data part of the block 
+which can be given to the decompressor. 
+
+Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
+or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
+
+For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
+
+You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
+your input was likely corrupted. 
+
+It is important to note that a successful decoding does *not* mean your output matches your original input. 
+There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
+
+# Contributing
+
+Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
+changes will likely not be accepted. If in doubt open an issue before writing the PR.
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index a4979e8..451160e 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -8,115 +8,10 @@ package huff0
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 )
 
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
-type bitReader struct {
-	in       []byte
-	off      uint // next byte to read is at in[off - 1]
-	value    uint64
-	bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
-	if len(in) < 1 {
-		return errors.New("corrupt stream: too short")
-	}
-	b.in = in
-	b.off = uint(len(in))
-	// The highest bit of the last byte indicates where to start
-	v := in[len(in)-1]
-	if v == 0 {
-		return errors.New("corrupt stream, did not find end of stream")
-	}
-	b.bitsRead = 64
-	b.value = 0
-	if len(in) >= 8 {
-		b.fillFastStart()
-	} else {
-		b.fill()
-		b.fill()
-	}
-	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
-	return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) peekBitsFast(n uint8) uint16 {
-	const regMask = 64 - 1
-	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
-	return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
-	if b.bitsRead < 32 {
-		return
-	}
-
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
-	b.bitsRead -= 32
-	b.off -= 4
-}
-
-func (b *bitReader) advance(n uint8) {
-	b.bitsRead += n
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
-	b.bitsRead = 0
-	b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
-	if b.bitsRead < 32 {
-		return
-	}
-	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
-		b.bitsRead -= 32
-		b.off -= 4
-		return
-	}
-	for b.off > 0 {
-		b.value = (b.value << 8) | uint64(b.in[b.off-1])
-		b.bitsRead -= 8
-		b.off--
-	}
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
-	// Release reference.
-	b.in = nil
-	if b.bitsRead > 64 {
-		return io.ErrUnexpectedEOF
-	}
-	return nil
-}
-
 // bitReader reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
@@ -213,10 +108,17 @@ func (b *bitReaderBytes) finished() bool {
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderBytes) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderBytes) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
@@ -263,6 +165,11 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	return uint16(b.value >> ((64 - n) & 63))
 }
 
+// peekTopBits(n) is equvialent to peekBitFast(64 - n)
+func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
+	return uint16(b.value >> n)
+}
+
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
@@ -318,10 +225,17 @@ func (b *bitReaderShifted) finished() bool {
 	return b.off == 0 && b.bitsRead >= 64
 }
 
+func (b *bitReaderShifted) remaining() uint {
+	return b.off*8 + uint(64-b.bitsRead)
+}
+
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReaderShifted) close() error {
 	// Release reference.
 	b.in = nil
+	if b.remaining() > 0 {
+		return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index bda4021..6bce4e8 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -43,6 +43,11 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
 func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
 	enc := ct[symbol]
 	b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
+	if false {
+		if enc.nBits == 0 {
+			panic("nbits 0")
+		}
+	}
 	b.nBits += enc.nBits
 }
 
@@ -54,6 +59,14 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
 	sh := b.nBits & 63
 	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
 	b.bitContainer |= combined << sh
+	if false {
+		if encA.nBits == 0 {
+			panic("nbitsA 0")
+		}
+		if encB.nBits == 0 {
+			panic("nbitsB 0")
+		}
+	}
 	b.nBits += encA.nBits + encB.nBits
 }
 
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 0843cb0..bc95ac6 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -2,6 +2,7 @@ package huff0
 
 import (
 	"fmt"
+	"math"
 	"runtime"
 	"sync"
 )
@@ -77,8 +78,11 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 		// Each symbol present maximum once or too well distributed.
 		return nil, false, ErrIncompressible
 	}
-
-	if s.Reuse == ReusePolicyPrefer && canReuse {
+	if s.Reuse == ReusePolicyMust && !canReuse {
+		// We must reuse, but we can't.
+		return nil, false, ErrIncompressible
+	}
+	if (s.Reuse == ReusePolicyPrefer || s.Reuse == ReusePolicyMust) && canReuse {
 		keepTable := s.cTable
 		keepTL := s.actualTableLog
 		s.cTable = s.prevTable
@@ -90,6 +94,9 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 			s.OutData = s.Out
 			return s.Out, true, nil
 		}
+		if s.Reuse == ReusePolicyMust {
+			return nil, false, ErrIncompressible
+		}
 		// Do not attempt to re-use later.
 		s.prevTable = s.prevTable[:0]
 	}
@@ -155,6 +162,70 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 	return s.Out, false, nil
 }
 
+// EstimateSizes will estimate the data sizes
+func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
+	s, err = s.prepare(in)
+	if err != nil {
+		return 0, 0, 0, err
+	}
+
+	// Create histogram, if none was provided.
+	tableSz, dataSz, reuseSz = -1, -1, -1
+	maxCount := s.maxCount
+	var canReuse = false
+	if maxCount == 0 {
+		maxCount, canReuse = s.countSimple(in)
+	} else {
+		canReuse = s.canUseTable(s.prevTable)
+	}
+
+	// We want the output size to be less than this:
+	wantSize := len(in)
+	if s.WantLogLess > 0 {
+		wantSize -= wantSize >> s.WantLogLess
+	}
+
+	// Reset for next run.
+	s.clearCount = true
+	s.maxCount = 0
+	if maxCount >= len(in) {
+		if maxCount > len(in) {
+			return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
+		}
+		if len(in) == 1 {
+			return 0, 0, 0, ErrIncompressible
+		}
+		// One symbol, use RLE
+		return 0, 0, 0, ErrUseRLE
+	}
+	if maxCount == 1 || maxCount < (len(in)>>7) {
+		// Each symbol present maximum once or too well distributed.
+		return 0, 0, 0, ErrIncompressible
+	}
+
+	// Calculate new table.
+	err = s.buildCTable()
+	if err != nil {
+		return 0, 0, 0, err
+	}
+
+	if false && !s.canUseTable(s.cTable) {
+		panic("invalid table generated")
+	}
+
+	tableSz, err = s.cTable.estTableSize(s)
+	if err != nil {
+		return 0, 0, 0, err
+	}
+	if canReuse {
+		reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
+	}
+	dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
+
+	// Restore
+	return tableSz, dataSz, reuseSz, nil
+}
+
 func (s *Scratch) compress1X(src []byte) ([]byte, error) {
 	return s.compress1xDo(s.Out, src)
 }
@@ -219,6 +290,10 @@ func (s *Scratch) compress4X(src []byte) ([]byte, error) {
 		if err != nil {
 			return nil, err
 		}
+		if len(s.Out)-idx > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
@@ -262,6 +337,10 @@ func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
 			return nil, errs[i]
 		}
 		o := s.tmpOut[i]
+		if len(o) > math.MaxUint16 {
+			// We cannot store the size in the jump table
+			return nil, ErrIncompressible
+		}
 		// Write compressed length as little endian before block.
 		if i < 3 {
 			// Last length is not written.
@@ -397,7 +476,7 @@ func (s *Scratch) buildCTable() error {
 	var startNode = int16(s.symbolLen)
 	nonNullRank := s.symbolLen - 1
 
-	nodeNb := int16(startNode)
+	nodeNb := startNode
 	huffNode := s.nodes[1 : huffNodesLen+1]
 
 	// This overlays the slice above, but allows "-1" index lookups.
@@ -530,7 +609,6 @@ func (s *Scratch) huffSort() {
 		}
 		nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
 	}
-	return
 }
 
 func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
@@ -574,7 +652,7 @@ func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
 
 		// Get pos of last (smallest) symbol per rank
 		{
-			currentNbBits := uint8(maxNbBits)
+			currentNbBits := maxNbBits
 			for pos := int(n); pos >= 0; pos-- {
 				if huffNode[pos].nbBits >= currentNbBits {
 					continue
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index a03b263..04f6529 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -20,7 +21,7 @@ type dEntrySingle struct {
 
 // double-symbols decoding
 type dEntryDouble struct {
-	seq   uint16
+	seq   [4]byte
 	nBits uint8
 	len   uint8
 }
@@ -32,7 +33,7 @@ const use8BitTables = true
 // The size of the input may be larger than the table definition.
 // Any content remaining after the table definition will be returned.
 // If no Scratch is provided a new one is allocated.
-// The returned Scratch can be used for decoding input using this table.
+// The returned Scratch can be used for encoding or decoding input using this table.
 func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 	s, err = s.prepare(in)
 	if err != nil {
@@ -58,8 +59,8 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 		s.symbolLen = uint16(oSize)
 		in = in[iSize:]
 	} else {
-		if len(in) <= int(iSize) {
-			return s, nil, errors.New("input too small for table")
+		if len(in) < int(iSize) {
+			return s, nil, fmt.Errorf("input too small for table, want %d bytes, have %d", iSize, len(in))
 		}
 		// FSE compressed weights
 		s.fse.DecompressLimit = 255
@@ -138,15 +139,33 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
 	if len(s.dt.single) != tSize {
 		s.dt.single = make([]dEntrySingle, tSize)
 	}
+	cTable := s.prevTable
+	if cap(cTable) < maxSymbolValue+1 {
+		cTable = make([]cTableEntry, 0, maxSymbolValue+1)
+	}
+	cTable = cTable[:maxSymbolValue+1]
+	s.prevTable = cTable[:s.symbolLen]
+	s.prevTableLog = s.actualTableLog
+
 	for n, w := range s.huffWeight[:s.symbolLen] {
 		if w == 0 {
+			cTable[n] = cTableEntry{
+				val:   0,
+				nBits: 0,
+			}
 			continue
 		}
 		length := (uint32(1) << w) >> 1
 		d := dEntrySingle{
 			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
 		}
+
 		rank := &rankStats[w]
+		cTable[n] = cTableEntry{
+			val:   uint16(*rank >> (w - 1)),
+			nBits: uint8(d.entry),
+		}
+
 		single := s.dt.single[*rank : *rank+length]
 		for i := range single {
 			single[i] = d
@@ -198,6 +217,7 @@ func (s *Scratch) Decoder() *Decoder {
 	return &Decoder{
 		dt:             s.dt,
 		actualTableLog: s.actualTableLog,
+		bufs:           &s.decPool,
 	}
 }
 
@@ -205,6 +225,15 @@ func (s *Scratch) Decoder() *Decoder {
 type Decoder struct {
 	dt             dTable
 	actualTableLog uint8
+	bufs           *sync.Pool
+}
+
+func (d *Decoder) buffer() *[4][256]byte {
+	buf, ok := d.bufs.Get().(*[4][256]byte)
+	if ok {
+		return buf
+	}
+	return &[4][256]byte{}
 }
 
 // Decompress1X will decompress a 1X encoded stream.
@@ -231,7 +260,8 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
 	for br.off >= 8 {
@@ -259,6 +289,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
 				br.close()
+				d.bufs.Put(bufs)
 				return nil, ErrMaxDecodedSizeExceeded
 			}
 			dst = append(dst, buf[:]...)
@@ -266,6 +297,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -292,6 +324,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
@@ -301,6 +334,7 @@ func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
 		bitsLeft -= nBits
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -323,41 +357,258 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
-	shift := (8 - d.actualTableLog) & 7
-
-	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
-	for br.off >= 4 {
-		br.fillFast()
-		v := dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+0] = uint8(v.entry >> 8)
-
-		v = dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+1] = uint8(v.entry >> 8)
-
-		v = dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+2] = uint8(v.entry >> 8)
-
-		v = dt[br.peekByteFast()>>shift]
-		br.advance(uint8(v.entry))
-		buf[off+3] = uint8(v.entry >> 8)
-
-		off += 4
-		if off == 0 {
-			if len(dst)+256 > maxDecodedSize {
-				br.close()
-				return nil, ErrMaxDecodedSizeExceeded
+	switch d.actualTableLog {
+	case 8:
+		const shift = 8 - 8
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					br.close()
+					d.bufs.Put(bufs)
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 7:
+		const shift = 8 - 7
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					br.close()
+					d.bufs.Put(bufs)
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 6:
+		const shift = 8 - 6
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 5:
+		const shift = 8 - 5
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 4:
+		const shift = 8 - 4
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 3:
+		const shift = 8 - 3
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 2:
+		const shift = 8 - 2
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
+			}
+		}
+	case 1:
+		const shift = 8 - 1
+		for br.off >= 4 {
+			br.fillFast()
+			v := dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+0] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+1] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+2] = uint8(v.entry >> 8)
+
+			v = dt[uint8(br.value>>(56+shift))]
+			br.advance(uint8(v.entry))
+			buf[off+3] = uint8(v.entry >> 8)
+
+			off += 4
+			if off == 0 {
+				if len(dst)+256 > maxDecodedSize {
+					d.bufs.Put(bufs)
+					br.close()
+					return nil, ErrMaxDecodedSizeExceeded
+				}
+				dst = append(dst, buf[:]...)
 			}
-			dst = append(dst, buf[:]...)
 		}
+	default:
+		d.bufs.Put(bufs)
+		return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -365,6 +616,8 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 
 	// br < 4, so uint8 is fine
 	bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+	shift := (8 - d.actualTableLog) & 7
+
 	for bitsLeft > 0 {
 		if br.bitsRead >= 64-8 {
 			for br.off > 0 {
@@ -375,6 +628,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 		}
 		if len(dst) >= maxDecodedSize {
 			br.close()
+			d.bufs.Put(bufs)
 			return nil, ErrMaxDecodedSizeExceeded
 		}
 		v := dt[br.peekByteFast()>>shift]
@@ -383,6 +637,7 @@ func (d *Decoder) decompress1X8Bit(dst, src []byte) ([]byte, error) {
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
@@ -402,33 +657,35 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	dt := d.dt.single[:256]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	bufs := d.buffer()
+	buf := &bufs[0]
 	var off uint8
 
-	const shift = 0
+	const shift = 56
 
 	//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
 	for br.off >= 4 {
 		br.fillFast()
-		v := dt[br.peekByteFast()>>shift]
+		v := dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+0] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
+		v = dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+1] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
+		v = dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+2] = uint8(v.entry >> 8)
 
-		v = dt[br.peekByteFast()>>shift]
+		v = dt[uint8(br.value>>shift)]
 		br.advance(uint8(v.entry))
 		buf[off+3] = uint8(v.entry >> 8)
 
 		off += 4
 		if off == 0 {
 			if len(dst)+256 > maxDecodedSize {
+				d.bufs.Put(bufs)
 				br.close()
 				return nil, ErrMaxDecodedSizeExceeded
 			}
@@ -437,6 +694,7 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 	}
 
 	if len(dst)+int(off) > maxDecodedSize {
+		d.bufs.Put(bufs)
 		br.close()
 		return nil, ErrMaxDecodedSizeExceeded
 	}
@@ -453,208 +711,20 @@ func (d *Decoder) decompress1X8BitExactly(dst, src []byte) ([]byte, error) {
 			}
 		}
 		if len(dst) >= maxDecodedSize {
+			d.bufs.Put(bufs)
 			br.close()
 			return nil, ErrMaxDecodedSizeExceeded
 		}
-		v := dt[br.peekByteFast()>>shift]
+		v := dt[br.peekByteFast()]
 		nBits := uint8(v.entry)
 		br.advance(nBits)
 		bitsLeft -= int8(nBits)
 		dst = append(dst, uint8(v.entry>>8))
 	}
+	d.bufs.Put(bufs)
 	return dst, br.close()
 }
 
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
-	if len(d.dt.single) == 0 {
-		return nil, errors.New("no table loaded")
-	}
-	if len(src) < 6+(4*1) {
-		return nil, errors.New("input too small")
-	}
-	if use8BitTables && d.actualTableLog <= 8 {
-		return d.decompress4X8bit(dst, src)
-	}
-
-	var br [4]bitReaderShifted
-	start := 6
-	for i := 0; i < 3; i++ {
-		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
-		if start+length >= len(src) {
-			return nil, errors.New("truncated input (or invalid offset)")
-		}
-		err := br[i].init(src[start : start+length])
-		if err != nil {
-			return nil, err
-		}
-		start += length
-	}
-	err := br[3].init(src[start:])
-	if err != nil {
-		return nil, err
-	}
-
-	// destination, offset to match first output
-	dstSize := cap(dst)
-	dst = dst[:dstSize]
-	out := dst
-	dstEvery := (dstSize + 3) / 4
-
-	const tlSize = 1 << tableLogMax
-	const tlMask = tlSize - 1
-	single := d.dt.single[:tlSize]
-
-	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
-	var off uint8
-	var decoded int
-
-	// Decode 2 values from each decoder/loop.
-	const bufoff = 256 / 4
-	for {
-		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
-			break
-		}
-
-		{
-			const stream = 0
-			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v2 := single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v2 = single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		{
-			const stream = 2
-			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			val := br[stream].peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
-			val2 := br[stream2].peekBitsFast(d.actualTableLog)
-			v2 := single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
-			val = br[stream].peekBitsFast(d.actualTableLog)
-			v = single[val&tlMask]
-			br[stream].advance(uint8(v.entry))
-			buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
-			val2 = br[stream2].peekBitsFast(d.actualTableLog)
-			v2 = single[val2&tlMask]
-			br[stream2].advance(uint8(v2.entry))
-			buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
-		}
-
-		off += 2
-
-		if off == bufoff {
-			if bufoff > dstEvery {
-				return nil, errors.New("corruption detected: stream overrun 1")
-			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
-			out = out[bufoff:]
-			decoded += 256
-			// There must at least be 3 buffers left.
-			if len(out) < dstEvery*3 {
-				return nil, errors.New("corruption detected: stream overrun 2")
-			}
-		}
-	}
-	if off > 0 {
-		ioff := int(off)
-		if len(out) < dstEvery*3+ioff {
-			return nil, errors.New("corruption detected: stream overrun 3")
-		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
-		decoded += int(off) * 4
-		out = out[off:]
-	}
-
-	// Decode remaining.
-	for i := range br {
-		offset := dstEvery * i
-		br := &br[i]
-		bitsLeft := br.off*8 + uint(64-br.bitsRead)
-		for bitsLeft > 0 {
-			br.fill()
-			if false && br.bitsRead >= 32 {
-				if br.off >= 4 {
-					v := br.in[br.off-4:]
-					v = v[:4]
-					low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-					br.value = (br.value << 32) | uint64(low)
-					br.bitsRead -= 32
-					br.off -= 4
-				} else {
-					for br.off > 0 {
-						br.value = (br.value << 8) | uint64(br.in[br.off-1])
-						br.bitsRead -= 8
-						br.off--
-					}
-				}
-			}
-			// end inline...
-			if offset >= len(out) {
-				return nil, errors.New("corruption detected: stream overrun 4")
-			}
-
-			// Read value and increment offset.
-			val := br.peekBitsFast(d.actualTableLog)
-			v := single[val&tlMask].entry
-			nBits := uint8(v)
-			br.advance(nBits)
-			bitsLeft -= uint(nBits)
-			out[offset] = uint8(v >> 8)
-			offset++
-		}
-		decoded += offset - dstEvery*i
-		err = br.close()
-		if err != nil {
-			return nil, err
-		}
-	}
-	if dstSize != decoded {
-		return nil, errors.New("corruption detected: short output block")
-	}
-	return dst, nil
-}
-
 // Decompress4X will decompress a 4X encoded stream.
 // The length of the supplied input must match the end of a block exactly.
 // The *capacity* of the dst slice must match the destination size of
@@ -688,19 +758,18 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 
-	shift := (8 - d.actualTableLog) & 7
+	shift := (56 + (8 - d.actualTableLog)) & 63
 
 	const tlSize = 1 << 8
-	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -710,96 +779,109 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -807,23 +889,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 	if off > 0 {
 		ioff := int(off)
 		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -843,24 +933,31 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
 			// Read value and increment offset.
-			v := single[br.peekByteFast()>>shift].entry
+			v := single[uint8(br.value>>shift)].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
@@ -896,18 +993,18 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 	out := dst
 	dstEvery := (dstSize + 3) / 4
 
-	const shift = 0
+	const shift = 56
 	const tlSize = 1 << 8
 	const tlMask = tlSize - 1
 	single := d.dt.single[:tlSize]
 
 	// Use temp table to avoid bound checks/append penalty.
-	var buf [256]byte
+	buf := d.buffer()
 	var off uint8
 	var decoded int
 
 	// Decode 4 values from each decoder/loop.
-	const bufoff = 256 / 4
+	const bufoff = 256
 	for {
 		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
 			break
@@ -917,96 +1014,109 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 			// Interleave 2 decodes.
 			const stream = 0
 			const stream2 = 1
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		{
 			const stream = 2
 			const stream2 = 3
-			br[stream].fillFast()
-			br[stream2].fillFast()
-
-			v := single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 := single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+1] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+2] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
-
-			v = single[br[stream].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream+3] = uint8(v >> 8)
-			br[stream].advance(uint8(v))
-
-			v2 = single[br[stream2].peekByteFast()>>shift].entry
-			buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
-			br[stream2].advance(uint8(v2))
+			br1 := &br[stream]
+			br2 := &br[stream2]
+			br1.fillFast()
+			br2.fillFast()
+
+			v := single[uint8(br1.value>>shift)].entry
+			v2 := single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off] = uint8(v >> 8)
+			buf[stream2][off] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+1] = uint8(v >> 8)
+			buf[stream2][off+1] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+2] = uint8(v >> 8)
+			buf[stream2][off+2] = uint8(v2 >> 8)
+
+			v = single[uint8(br1.value>>shift)].entry
+			v2 = single[uint8(br2.value>>shift)].entry
+			br1.bitsRead += uint8(v)
+			br1.value <<= v & 63
+			br2.bitsRead += uint8(v2)
+			br2.value <<= v2 & 63
+			buf[stream][off+3] = uint8(v >> 8)
+			buf[stream2][off+3] = uint8(v2 >> 8)
 		}
 
 		off += 4
 
-		if off == bufoff {
+		if off == 0 {
 			if bufoff > dstEvery {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[:bufoff])
-			copy(out[dstEvery:], buf[bufoff:bufoff*2])
-			copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
-			copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
-			off = 0
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
 			out = out[bufoff:]
-			decoded += 256
+			decoded += bufoff * 4
 			// There must at least be 3 buffers left.
 			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
 		}
@@ -1016,21 +1126,27 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 		if len(out) < dstEvery*3+ioff {
 			return nil, errors.New("corruption detected: stream overrun 3")
 		}
-		copy(out, buf[:off])
-		copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
-		copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
-		copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
 		decoded += int(off) * 4
 		out = out[off:]
 	}
 
 	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
 	for i := range br {
 		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
 		br := &br[i]
-		bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+		bitsLeft := br.remaining()
 		for bitsLeft > 0 {
 			if br.finished() {
+				d.bufs.Put(buf)
 				return nil, io.ErrUnexpectedEOF
 			}
 			if br.bitsRead >= 56 {
@@ -1050,24 +1166,32 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
 				}
 			}
 			// end inline...
-			if offset >= len(out) {
+			if offset >= endsAt {
+				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 4")
 			}
 
 			// Read value and increment offset.
-			v := single[br.peekByteFast()>>shift].entry
+			v := single[br.peekByteFast()].entry
 			nBits := uint8(v)
 			br.advance(nBits)
-			bitsLeft -= int(nBits)
+			bitsLeft -= uint(nBits)
 			out[offset] = uint8(v >> 8)
 			offset++
 		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+
 		decoded += offset - dstEvery*i
 		err = br.close()
 		if err != nil {
+			d.bufs.Put(buf)
 			return nil, err
 		}
 	}
+	d.bufs.Put(buf)
 	if dstSize != decoded {
 		return nil, errors.New("corruption detected: short output block")
 	}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 0000000..3415e5d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,148 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// that uses an asm implementation of its main loop.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog > 8.
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+
+// decompress4x_8b_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog <= 8 which decodes 4 entries
+// per loop.
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+
+// fallback8BitSize is the size where using Go version is faster.
+const fallback8BitSize = 800
+
+type decompress4xContext struct {
+	pbr0     *bitReaderShifted
+	pbr1     *bitReaderShifted
+	pbr2     *bitReaderShifted
+	pbr3     *bitReaderShifted
+	peekBits uint8
+	out      *byte
+	dstEvery int
+	tbl      *dEntrySingle
+	decoded  int
+	limit    *byte
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+
+	use8BitTables := d.actualTableLog <= 8
+	if cap(dst) < fallback8BitSize && use8BitTables {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	var decoded int
+
+	if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+		ctx := decompress4xContext{
+			pbr0:     &br[0],
+			pbr1:     &br[1],
+			pbr2:     &br[2],
+			pbr3:     &br[3],
+			peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+			out:      &out[0],
+			dstEvery: dstEvery,
+			tbl:      &single[0],
+			limit:    &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
+		}
+		if use8BitTables {
+			decompress4x_8b_main_loop_amd64(&ctx)
+		} else {
+			decompress4x_main_loop_amd64(&ctx)
+		}
+
+		decoded = ctx.decoded
+		out = out[decoded/4:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 0000000..06287f5
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,662 @@
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $8-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), AX
+	MOVBQZX 32(AX), SI
+	MOVQ    40(AX), DI
+	MOVQ    DI, BX
+	MOVQ    72(AX), CX
+	MOVQ    CX, (SP)
+	MOVQ    48(AX), R8
+	MOVQ    56(AX), R9
+	MOVQ    (AX), R10
+	MOVQ    8(AX), R11
+	MOVQ    16(AX), R12
+	MOVQ    24(AX), R13
+
+	// Main loop
+main_loop:
+	MOVQ  BX, DI
+	CMPQ  DI, (SP)
+	SETGE DL
+
+	// br0.fillFast32()
+	MOVQ    32(R10), R14
+	MOVBQZX 40(R10), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill0
+	MOVQ    24(R10), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R10), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R10)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br0.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill0:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br0.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitrader reader structure
+	MOVQ R14, 32(R10)
+	MOVB R15, 40(R10)
+	ADDQ R8, DI
+
+	// br1.fillFast32()
+	MOVQ    32(R11), R14
+	MOVBQZX 40(R11), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill1
+	MOVQ    24(R11), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R11), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R11)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br1.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br1.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitrader reader structure
+	MOVQ R14, 32(R11)
+	MOVB R15, 40(R11)
+	ADDQ R8, DI
+
+	// br2.fillFast32()
+	MOVQ    32(R12), R14
+	MOVBQZX 40(R12), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill2
+	MOVQ    24(R12), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R12), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R12)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br2.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill2:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br2.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitrader reader structure
+	MOVQ R14, 32(R12)
+	MOVB R15, 40(R12)
+	ADDQ R8, DI
+
+	// br3.fillFast32()
+	MOVQ    32(R13), R14
+	MOVBQZX 40(R13), R15
+	CMPQ    R15, $0x20
+	JBE     skip_fill3
+	MOVQ    24(R13), AX
+	SUBQ    $0x20, R15
+	SUBQ    $0x04, AX
+	MOVQ    (R13), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (AX)(BP*1), BP
+	MOVQ R15, CX
+	SHLQ CL, BP
+	MOVQ AX, 24(R13)
+	ORQ  BP, R14
+
+	// exhausted = exhausted || (br3.off < 4)
+	CMPQ  AX, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill3:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R14, BP
+	MOVQ SI, CX
+	SHRQ CL, BP
+
+	// v0 := table[val0&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ SI, CX
+	MOVQ R14, BP
+	SHRQ CL, BP
+
+	// v1 := table[val1&mask]
+	MOVW (R9)(BP*2), CX
+
+	// br3.advance(uint8(v1.entry))
+	MOVB CH, AH
+	SHLQ CL, R14
+	ADDB CL, R15
+
+	// these two writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	MOVW AX, (DI)
+
+	// update the bitrader reader structure
+	MOVQ  R14, 32(R13)
+	MOVB  R15, 40(R13)
+	ADDQ  $0x02, BX
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  BX, DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
+
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
+	XORQ DX, DX
+
+	// Preload values
+	MOVQ    ctx+0(FP), CX
+	MOVBQZX 32(CX), BX
+	MOVQ    40(CX), SI
+	MOVQ    SI, (SP)
+	MOVQ    72(CX), DX
+	MOVQ    DX, 8(SP)
+	MOVQ    48(CX), DI
+	MOVQ    56(CX), R8
+	MOVQ    (CX), R9
+	MOVQ    8(CX), R10
+	MOVQ    16(CX), R11
+	MOVQ    24(CX), R12
+
+	// Main loop
+main_loop:
+	MOVQ  (SP), SI
+	CMPQ  SI, 8(SP)
+	SETGE DL
+
+	// br1000.fillFast32()
+	MOVQ    32(R9), R13
+	MOVBQZX 40(R9), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1000
+	MOVQ    24(R9), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R9), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R9)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1000.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1000:
+	// val0 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br0.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br0.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R9)
+	MOVB R14, 40(R9)
+	ADDQ DI, SI
+
+	// br1001.fillFast32()
+	MOVQ    32(R10), R13
+	MOVBQZX 40(R10), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1001
+	MOVQ    24(R10), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R10), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R10)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1001.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1001:
+	// val0 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br1.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br1.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R10)
+	MOVB R14, 40(R10)
+	ADDQ DI, SI
+
+	// br1002.fillFast32()
+	MOVQ    32(R11), R13
+	MOVBQZX 40(R11), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1002
+	MOVQ    24(R11), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R11), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R11)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1002.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1002:
+	// val0 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br2.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br2.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ R13, 32(R11)
+	MOVB R14, 40(R11)
+	ADDQ DI, SI
+
+	// br1003.fillFast32()
+	MOVQ    32(R12), R13
+	MOVBQZX 40(R12), R14
+	CMPQ    R14, $0x20
+	JBE     skip_fill1003
+	MOVQ    24(R12), R15
+	SUBQ    $0x20, R14
+	SUBQ    $0x04, R15
+	MOVQ    (R12), BP
+
+	// b.value |= uint64(low) << (b.bitsRead & 63)
+	MOVL (R15)(BP*1), BP
+	MOVQ R14, CX
+	SHLQ CL, BP
+	MOVQ R15, 24(R12)
+	ORQ  BP, R13
+
+	// exhausted = exhausted || (br1003.off < 4)
+	CMPQ  R15, $0x04
+	SETLT AL
+	ORB   AL, DL
+
+skip_fill1003:
+	// val0 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v0 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v0.entry)
+	MOVB CH, AL
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val1 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v1 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v1.entry)
+	MOVB   CH, AH
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// val2 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v2 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v2.entry)
+	MOVB CH, AH
+	SHLQ CL, R13
+	ADDB CL, R14
+
+	// val3 := br3.peekTopBits(peekBits)
+	MOVQ R13, R15
+	MOVQ BX, CX
+	SHRQ CL, R15
+
+	// v3 := table[val0&mask]
+	MOVW (R8)(R15*2), CX
+
+	// br3.advance(uint8(v3.entry)
+	MOVB   CH, AL
+	SHLQ   CL, R13
+	ADDB   CL, R14
+	BSWAPL AX
+
+	// these four writes get coalesced
+	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+	MOVL AX, (SI)
+
+	// update the bitreader reader structure
+	MOVQ  R13, 32(R12)
+	MOVB  R14, 40(R12)
+	ADDQ  $0x04, (SP)
+	TESTB DL, DL
+	JZ    main_loop
+	MOVQ  ctx+0(FP), AX
+	MOVQ  40(AX), CX
+	MOVQ  (SP), DX
+	SUBQ  CX, DX
+	SHLQ  $0x02, DX
+	MOVQ  DX, 64(AX)
+	RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 0000000..126b4d6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,193 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+	if len(d.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(src) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if use8BitTables && d.actualTableLog <= 8 {
+		return d.decompress4X8bit(dst, src)
+	}
+
+	var br [4]bitReaderShifted
+	// Decode "jump table"
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+		if start+length >= len(src) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err := br[i].init(src[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err := br[3].init(src[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// destination, offset to match first output
+	dstSize := cap(dst)
+	dst = dst[:dstSize]
+	out := dst
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := d.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	buf := d.buffer()
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop.
+	const bufoff = 256
+	for {
+		if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+			break
+		}
+
+		{
+			const stream = 0
+			const stream2 = 1
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		{
+			const stream = 2
+			const stream2 = 3
+			br[stream].fillFast()
+			br[stream2].fillFast()
+
+			val := br[stream].peekBitsFast(d.actualTableLog)
+			val2 := br[stream2].peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask]
+			v2 := single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off] = uint8(v.entry >> 8)
+			buf[stream2][off] = uint8(v2.entry >> 8)
+
+			val = br[stream].peekBitsFast(d.actualTableLog)
+			val2 = br[stream2].peekBitsFast(d.actualTableLog)
+			v = single[val&tlMask]
+			v2 = single[val2&tlMask]
+			br[stream].advance(uint8(v.entry))
+			br[stream2].advance(uint8(v2.entry))
+			buf[stream][off+1] = uint8(v.entry >> 8)
+			buf[stream2][off+1] = uint8(v2.entry >> 8)
+		}
+
+		off += 2
+
+		if off == 0 {
+			if bufoff > dstEvery {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(out, buf[0][:])
+			copy(out[dstEvery:], buf[1][:])
+			copy(out[dstEvery*2:], buf[2][:])
+			copy(out[dstEvery*3:], buf[3][:])
+			out = out[bufoff:]
+			decoded += bufoff * 4
+			// There must at least be 3 buffers left.
+			if len(out) < dstEvery*3 {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(out) < dstEvery*3+ioff {
+			d.bufs.Put(buf)
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(out, buf[0][:off])
+		copy(out[dstEvery:], buf[1][:off])
+		copy(out[dstEvery*2:], buf[2][:off])
+		copy(out[dstEvery*3:], buf[3][:off])
+		decoded += int(off) * 4
+		out = out[off:]
+	}
+
+	// Decode remaining.
+	remainBytes := dstEvery - (decoded / 4)
+	for i := range br {
+		offset := dstEvery * i
+		endsAt := offset + remainBytes
+		if endsAt > len(out) {
+			endsAt = len(out)
+		}
+		br := &br[i]
+		bitsLeft := br.remaining()
+		for bitsLeft > 0 {
+			br.fill()
+			if offset >= endsAt {
+				d.bufs.Put(buf)
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+
+			// Read value and increment offset.
+			val := br.peekBitsFast(d.actualTableLog)
+			v := single[val&tlMask].entry
+			nBits := uint8(v)
+			br.advance(nBits)
+			bitsLeft -= uint(nBits)
+			out[offset] = uint8(v >> 8)
+			offset++
+		}
+		if offset != endsAt {
+			d.bufs.Put(buf)
+			return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	d.bufs.Put(buf)
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return dst, nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 177d6c4..e8ad17a 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -8,6 +8,7 @@ import (
 	"fmt"
 	"math"
 	"math/bits"
+	"sync"
 
 	"github.com/klauspost/compress/fse"
 )
@@ -55,6 +56,9 @@ const (
 	// ReusePolicyNone will disable re-use of tables.
 	// This is slightly faster than ReusePolicyAllow but may produce larger output.
 	ReusePolicyNone
+
+	// ReusePolicyMust must allow reuse and produce smaller output.
+	ReusePolicyMust
 )
 
 type Scratch struct {
@@ -113,9 +117,20 @@ type Scratch struct {
 	nodes          []nodeElt
 	tmpOut         [4][]byte
 	fse            *fse.Scratch
+	decPool        sync.Pool // *[4][256]byte buffers.
 	huffWeight     [maxSymbolValue + 1]byte
 }
 
+// TransferCTable will transfer the previously used compression table.
+func (s *Scratch) TransferCTable(src *Scratch) {
+	if cap(s.prevTable) < len(src.prevTable) {
+		s.prevTable = make(cTable, 0, maxSymbolValue+1)
+	}
+	s.prevTable = s.prevTable[:len(src.prevTable)]
+	copy(s.prevTable, src.prevTable)
+	s.prevTableLog = src.prevTableLog
+}
+
 func (s *Scratch) prepare(in []byte) (*Scratch, error) {
 	if len(in) > BlockSizeMax {
 		return nil, ErrTooBig
@@ -232,6 +247,68 @@ func (c cTable) write(s *Scratch) error {
 	return nil
 }
 
+func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
+	var (
+		// precomputed conversion table
+		bitsToWeight [tableLogMax + 1]byte
+		huffLog      = s.actualTableLog
+		// last weight is not saved.
+		maxSymbolValue = uint8(s.symbolLen - 1)
+		huffWeight     = s.huffWeight[:256]
+	)
+	const (
+		maxFSETableLog = 6
+	)
+	// convert to weight
+	bitsToWeight[0] = 0
+	for n := uint8(1); n < huffLog+1; n++ {
+		bitsToWeight[n] = huffLog + 1 - n
+	}
+
+	// Acquire histogram for FSE.
+	hist := s.fse.Histogram()
+	hist = hist[:256]
+	for i := range hist[:16] {
+		hist[i] = 0
+	}
+	for n := uint8(0); n < maxSymbolValue; n++ {
+		v := bitsToWeight[c[n].nBits] & 15
+		huffWeight[n] = v
+		hist[v]++
+	}
+
+	// FSE compress if feasible.
+	if maxSymbolValue >= 2 {
+		huffMaxCnt := uint32(0)
+		huffMax := uint8(0)
+		for i, v := range hist[:16] {
+			if v == 0 {
+				continue
+			}
+			huffMax = byte(i)
+			if v > huffMaxCnt {
+				huffMaxCnt = v
+			}
+		}
+		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
+		s.fse.TableLog = maxFSETableLog
+		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
+		if err == nil && len(b) < int(s.symbolLen>>1) {
+			sz += 1 + len(b)
+			return sz, nil
+		}
+		// Unable to compress (RLE/uncompressible)
+	}
+	// write raw values as 4-bits (max : 15)
+	if maxSymbolValue > (256 - 128) {
+		// should not happen : likely means source cannot be compressed
+		return 0, ErrIncompressible
+	}
+	// special case, pack weights 4 bits/weight.
+	sz += 1 + int(maxSymbolValue/2)
+	return sz, nil
+}
+
 // estimateSize returns the estimated size in bytes of the input represented in the
 // histogram supplied.
 func (c cTable) estimateSize(hist []uint32) int {
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
new file mode 100644
index 0000000..3954c51
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo.go
@@ -0,0 +1,34 @@
+// Package cpuinfo gives runtime info about the current CPU.
+//
+// This is a very limited module meant for use internally
+// in this project. For more versatile solution check
+// https://github.com/klauspost/cpuid.
+package cpuinfo
+
+// HasBMI1 checks whether an x86 CPU supports the BMI1 extension.
+func HasBMI1() bool {
+	return hasBMI1
+}
+
+// HasBMI2 checks whether an x86 CPU supports the BMI2 extension.
+func HasBMI2() bool {
+	return hasBMI2
+}
+
+// DisableBMI2 will disable BMI2, for testing purposes.
+// Call returned function to restore previous state.
+func DisableBMI2() func() {
+	old := hasBMI2
+	hasBMI2 = false
+	return func() {
+		hasBMI2 = old
+	}
+}
+
+// HasBMI checks whether an x86 CPU supports both BMI1 and BMI2 extensions.
+func HasBMI() bool {
+	return HasBMI1() && HasBMI2()
+}
+
+var hasBMI1 bool
+var hasBMI2 bool
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
new file mode 100644
index 0000000..e802579
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.go
@@ -0,0 +1,11 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package cpuinfo
+
+// go:noescape
+func x86extensions() (bmi1, bmi2 bool)
+
+func init() {
+	hasBMI1, hasBMI2 = x86extensions()
+}
diff --git a/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
new file mode 100644
index 0000000..4465fbe
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/cpuinfo/cpuinfo_amd64.s
@@ -0,0 +1,36 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+#include "funcdata.h"
+#include "go_asm.h"
+
+TEXT ·x86extensions(SB), NOSPLIT, $0
+	// 1. determine max EAX value
+	XORQ AX, AX
+	CPUID
+
+	CMPQ AX, $7
+	JB   unsupported
+
+	// 2. EAX = 7, ECX = 0 --- see Table 3-8 "Information Returned by CPUID Instruction"
+	MOVQ $7, AX
+	MOVQ $0, CX
+	CPUID
+
+	BTQ   $3, BX // bit 3 = BMI1
+	SETCS AL
+
+	BTQ   $8, BX // bit 8 = BMI2
+	SETCS AH
+
+	MOVB AL, bmi1+0(FP)
+	MOVB AH, bmi2+1(FP)
+	RET
+
+unsupported:
+	XORQ AX, AX
+	MOVB AL, bmi1+0(FP)
+	MOVB AL, bmi2+1(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/LICENSE b/vendor/github.com/klauspost/compress/internal/snapref/LICENSE
similarity index 100%
rename from vendor/github.com/klauspost/compress/snappy/LICENSE
rename to vendor/github.com/klauspost/compress/internal/snapref/LICENSE
diff --git a/vendor/github.com/klauspost/compress/snappy/decode.go b/vendor/github.com/klauspost/compress/internal/snapref/decode.go
similarity index 85%
rename from vendor/github.com/klauspost/compress/snappy/decode.go
rename to vendor/github.com/klauspost/compress/internal/snapref/decode.go
index 72efb03..40796a4 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/decode.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package snappy
+package snapref
 
 import (
 	"encoding/binary"
@@ -52,6 +52,8 @@ const (
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Decode handles the Snappy block format, not the Snappy stream format.
 func Decode(dst, src []byte) ([]byte, error) {
 	dLen, s, err := decodedLen(src)
 	if err != nil {
@@ -83,6 +85,8 @@ func NewReader(r io.Reader) *Reader {
 }
 
 // Reader is an io.Reader that can read Snappy-compressed bytes.
+//
+// Reader handles the Snappy stream format, not the Snappy block format.
 type Reader struct {
 	r       io.Reader
 	err     error
@@ -114,32 +118,23 @@ func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
 	return true
 }
 
-// Read satisfies the io.Reader interface.
-func (r *Reader) Read(p []byte) (int, error) {
-	if r.err != nil {
-		return 0, r.err
-	}
-	for {
-		if r.i < r.j {
-			n := copy(p, r.decoded[r.i:r.j])
-			r.i += n
-			return n, nil
-		}
+func (r *Reader) fill() error {
+	for r.i >= r.j {
 		if !r.readFull(r.buf[:4], true) {
-			return 0, r.err
+			return r.err
 		}
 		chunkType := r.buf[0]
 		if !r.readHeader {
 			if chunkType != chunkTypeStreamIdentifier {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			r.readHeader = true
 		}
 		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
 		if chunkLen > len(r.buf) {
 			r.err = ErrUnsupported
-			return 0, r.err
+			return r.err
 		}
 
 		// The chunk types are specified at
@@ -149,11 +144,11 @@ func (r *Reader) Read(p []byte) (int, error) {
 			// Section 4.2. Compressed data (chunk type 0x00).
 			if chunkLen < checksumSize {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			buf := r.buf[:chunkLen]
 			if !r.readFull(buf, false) {
-				return 0, r.err
+				return r.err
 			}
 			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
 			buf = buf[checksumSize:]
@@ -161,19 +156,19 @@ func (r *Reader) Read(p []byte) (int, error) {
 			n, err := DecodedLen(buf)
 			if err != nil {
 				r.err = err
-				return 0, r.err
+				return r.err
 			}
 			if n > len(r.decoded) {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			if _, err := Decode(r.decoded, buf); err != nil {
 				r.err = err
-				return 0, r.err
+				return r.err
 			}
 			if crc(r.decoded[:n]) != checksum {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			r.i, r.j = 0, n
 			continue
@@ -182,25 +177,25 @@ func (r *Reader) Read(p []byte) (int, error) {
 			// Section 4.3. Uncompressed data (chunk type 0x01).
 			if chunkLen < checksumSize {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			buf := r.buf[:checksumSize]
 			if !r.readFull(buf, false) {
-				return 0, r.err
+				return r.err
 			}
 			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
 			// Read directly into r.decoded instead of via r.buf.
 			n := chunkLen - checksumSize
 			if n > len(r.decoded) {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			if !r.readFull(r.decoded[:n], false) {
-				return 0, r.err
+				return r.err
 			}
 			if crc(r.decoded[:n]) != checksum {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			r.i, r.j = 0, n
 			continue
@@ -209,15 +204,15 @@ func (r *Reader) Read(p []byte) (int, error) {
 			// Section 4.1. Stream identifier (chunk type 0xff).
 			if chunkLen != len(magicBody) {
 				r.err = ErrCorrupt
-				return 0, r.err
+				return r.err
 			}
 			if !r.readFull(r.buf[:len(magicBody)], false) {
-				return 0, r.err
+				return r.err
 			}
 			for i := 0; i < len(magicBody); i++ {
 				if r.buf[i] != magicBody[i] {
 					r.err = ErrCorrupt
-					return 0, r.err
+					return r.err
 				}
 			}
 			continue
@@ -226,12 +221,44 @@ func (r *Reader) Read(p []byte) (int, error) {
 		if chunkType <= 0x7f {
 			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
 			r.err = ErrUnsupported
-			return 0, r.err
+			return r.err
 		}
 		// Section 4.4 Padding (chunk type 0xfe).
 		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
 		if !r.readFull(r.buf[:chunkLen], false) {
-			return 0, r.err
+			return r.err
 		}
 	}
+
+	return nil
+}
+
+// Read satisfies the io.Reader interface.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+
+	if err := r.fill(); err != nil {
+		return 0, err
+	}
+
+	n := copy(p, r.decoded[r.i:r.j])
+	r.i += n
+	return n, nil
+}
+
+// ReadByte satisfies the io.ByteReader interface.
+func (r *Reader) ReadByte() (byte, error) {
+	if r.err != nil {
+		return 0, r.err
+	}
+
+	if err := r.fill(); err != nil {
+		return 0, err
+	}
+
+	c := r.decoded[r.i]
+	r.i++
+	return c, nil
 }
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go
similarity index 97%
rename from vendor/github.com/klauspost/compress/snappy/decode_other.go
rename to vendor/github.com/klauspost/compress/internal/snapref/decode_other.go
index 94a96c5..77395a6 100644
--- a/vendor/github.com/klauspost/compress/snappy/decode_other.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/decode_other.go
@@ -2,9 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine !gc noasm
-
-package snappy
+package snapref
 
 // decode writes the decoding of src to dst. It assumes that the varint-encoded
 // length of the decompressed bytes has already been read, and that len(dst)
@@ -87,7 +85,7 @@ func decode(dst, src []byte) int {
 		}
 		// Copy from an earlier sub-slice of dst to a later sub-slice.
 		// If no overlap, use the built-in copy:
-		if offset > length {
+		if offset >= length {
 			copy(dst[d:d+length], dst[d-offset:])
 			d += length
 			continue
diff --git a/vendor/github.com/klauspost/compress/snappy/encode.go b/vendor/github.com/klauspost/compress/internal/snapref/encode.go
similarity index 98%
rename from vendor/github.com/klauspost/compress/snappy/encode.go
rename to vendor/github.com/klauspost/compress/internal/snapref/encode.go
index 8d393e9..13c6040 100644
--- a/vendor/github.com/klauspost/compress/snappy/encode.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/encode.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package snappy
+package snapref
 
 import (
 	"encoding/binary"
@@ -15,6 +15,8 @@ import (
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
+//
+// Encode handles the Snappy block format, not the Snappy stream format.
 func Encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
@@ -139,6 +141,8 @@ func NewBufferedWriter(w io.Writer) *Writer {
 }
 
 // Writer is an io.Writer that can write Snappy-compressed bytes.
+//
+// Writer handles the Snappy stream format, not the Snappy block format.
 type Writer struct {
 	w   io.Writer
 	err error
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
similarity index 99%
rename from vendor/github.com/klauspost/compress/snappy/encode_other.go
rename to vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
index dbcae90..511bba6 100644
--- a/vendor/github.com/klauspost/compress/snappy/encode_other.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
@@ -2,9 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !amd64 appengine !gc noasm
-
-package snappy
+package snapref
 
 func load32(b []byte, i int) uint32 {
 	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
diff --git a/vendor/github.com/klauspost/compress/snappy/snappy.go b/vendor/github.com/klauspost/compress/internal/snapref/snappy.go
similarity index 97%
rename from vendor/github.com/klauspost/compress/snappy/snappy.go
rename to vendor/github.com/klauspost/compress/internal/snapref/snappy.go
index 74a3668..34d01f4 100644
--- a/vendor/github.com/klauspost/compress/snappy/snappy.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/snappy.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Package snappy implements the Snappy compression format. It aims for very
+// Package snapref implements the Snappy compression format. It aims for very
 // high speeds and reasonable compression.
 //
 // There are actually two Snappy formats: block and stream. They are related,
@@ -17,7 +17,7 @@
 //
 // The canonical, C++ implementation is at https://github.com/google/snappy and
 // it only implements the block format.
-package snappy
+package snapref
 
 import (
 	"hash/crc32"
diff --git a/vendor/github.com/klauspost/compress/s2sx.mod b/vendor/github.com/klauspost/compress/s2sx.mod
new file mode 100644
index 0000000..2263853
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/s2sx.mod
@@ -0,0 +1,4 @@
+module github.com/klauspost/compress
+
+go 1.16
+
diff --git a/vendor/github.com/klauspost/compress/s2sx.sum b/vendor/github.com/klauspost/compress/s2sx.sum
new file mode 100644
index 0000000..e69de29
diff --git a/vendor/github.com/klauspost/compress/snappy/.gitignore b/vendor/github.com/klauspost/compress/snappy/.gitignore
deleted file mode 100644
index 042091d..0000000
--- a/vendor/github.com/klauspost/compress/snappy/.gitignore
+++ /dev/null
@@ -1,16 +0,0 @@
-cmd/snappytool/snappytool
-testdata/bench
-
-# These explicitly listed benchmark data files are for an obsolete version of
-# snappy_test.go.
-testdata/alice29.txt
-testdata/asyoulik.txt
-testdata/fireworks.jpeg
-testdata/geo.protodata
-testdata/html
-testdata/html_x_4
-testdata/kppkn.gtb
-testdata/lcet10.txt
-testdata/paper-100k.pdf
-testdata/plrabn12.txt
-testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/snappy/AUTHORS b/vendor/github.com/klauspost/compress/snappy/AUTHORS
deleted file mode 100644
index bcfa195..0000000
--- a/vendor/github.com/klauspost/compress/snappy/AUTHORS
+++ /dev/null
@@ -1,15 +0,0 @@
-# This is the official list of Snappy-Go authors for copyright purposes.
-# This file is distinct from the CONTRIBUTORS files.
-# See the latter for an explanation.
-
-# Names should be added to this file as
-#	Name or Organization <email address>
-# The email address is not required for organizations.
-
-# Please keep the list sorted.
-
-Damian Gryski <dgryski@gmail.com>
-Google Inc.
-Jan Mercl <0xjnml@gmail.com>
-Rodolfo Carvalho <rhcarvalho@gmail.com>
-Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS b/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
deleted file mode 100644
index 931ae31..0000000
--- a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
+++ /dev/null
@@ -1,37 +0,0 @@
-# This is the official list of people who can contribute
-# (and typically have contributed) code to the Snappy-Go repository.
-# The AUTHORS file lists the copyright holders; this file
-# lists people.  For example, Google employees are listed here
-# but not in AUTHORS, because Google holds the copyright.
-#
-# The submission process automatically checks to make sure
-# that people submitting code are listed in this file (by email address).
-#
-# Names should be added to this file only after verifying that
-# the individual or the individual's organization has agreed to
-# the appropriate Contributor License Agreement, found here:
-#
-#     http://code.google.com/legal/individual-cla-v1.0.html
-#     http://code.google.com/legal/corporate-cla-v1.0.html
-#
-# The agreement for individuals can be filled out on the web.
-#
-# When adding J Random Contributor's name to this file,
-# either J's name or J's organization's name should be
-# added to the AUTHORS file, depending on whether the
-# individual or corporate CLA was used.
-
-# Names should be added to this file like so:
-#     Name <email address>
-
-# Please keep the list sorted.
-
-Damian Gryski <dgryski@gmail.com>
-Jan Mercl <0xjnml@gmail.com>
-Kai Backman <kaib@golang.org>
-Marc-Antoine Ruel <maruel@chromium.org>
-Nigel Tao <nigeltao@golang.org>
-Rob Pike <r@golang.org>
-Rodolfo Carvalho <rhcarvalho@gmail.com>
-Russ Cox <rsc@golang.org>
-Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/README b/vendor/github.com/klauspost/compress/snappy/README
deleted file mode 100644
index cea1287..0000000
--- a/vendor/github.com/klauspost/compress/snappy/README
+++ /dev/null
@@ -1,107 +0,0 @@
-The Snappy compression format in the Go programming language.
-
-To download and install from source:
-$ go get github.com/golang/snappy
-
-Unless otherwise noted, the Snappy-Go source files are distributed
-under the BSD-style license found in the LICENSE file.
-
-
-
-Benchmarks.
-
-The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
-or so files, the same set used by the C++ Snappy code (github.com/google/snappy
-and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
-3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
-
-"go test -test.bench=."
-
-_UFlat0-8         2.19GB/s ± 0%  html
-_UFlat1-8         1.41GB/s ± 0%  urls
-_UFlat2-8         23.5GB/s ± 2%  jpg
-_UFlat3-8         1.91GB/s ± 0%  jpg_200
-_UFlat4-8         14.0GB/s ± 1%  pdf
-_UFlat5-8         1.97GB/s ± 0%  html4
-_UFlat6-8          814MB/s ± 0%  txt1
-_UFlat7-8          785MB/s ± 0%  txt2
-_UFlat8-8          857MB/s ± 0%  txt3
-_UFlat9-8          719MB/s ± 1%  txt4
-_UFlat10-8        2.84GB/s ± 0%  pb
-_UFlat11-8        1.05GB/s ± 0%  gaviota
-
-_ZFlat0-8         1.04GB/s ± 0%  html
-_ZFlat1-8          534MB/s ± 0%  urls
-_ZFlat2-8         15.7GB/s ± 1%  jpg
-_ZFlat3-8          740MB/s ± 3%  jpg_200
-_ZFlat4-8         9.20GB/s ± 1%  pdf
-_ZFlat5-8          991MB/s ± 0%  html4
-_ZFlat6-8          379MB/s ± 0%  txt1
-_ZFlat7-8          352MB/s ± 0%  txt2
-_ZFlat8-8          396MB/s ± 1%  txt3
-_ZFlat9-8          327MB/s ± 1%  txt4
-_ZFlat10-8        1.33GB/s ± 1%  pb
-_ZFlat11-8         605MB/s ± 1%  gaviota
-
-
-
-"go test -test.bench=. -tags=noasm"
-
-_UFlat0-8          621MB/s ± 2%  html
-_UFlat1-8          494MB/s ± 1%  urls
-_UFlat2-8         23.2GB/s ± 1%  jpg
-_UFlat3-8         1.12GB/s ± 1%  jpg_200
-_UFlat4-8         4.35GB/s ± 1%  pdf
-_UFlat5-8          609MB/s ± 0%  html4
-_UFlat6-8          296MB/s ± 0%  txt1
-_UFlat7-8          288MB/s ± 0%  txt2
-_UFlat8-8          309MB/s ± 1%  txt3
-_UFlat9-8          280MB/s ± 1%  txt4
-_UFlat10-8         753MB/s ± 0%  pb
-_UFlat11-8         400MB/s ± 0%  gaviota
-
-_ZFlat0-8          409MB/s ± 1%  html
-_ZFlat1-8          250MB/s ± 1%  urls
-_ZFlat2-8         12.3GB/s ± 1%  jpg
-_ZFlat3-8          132MB/s ± 0%  jpg_200
-_ZFlat4-8         2.92GB/s ± 0%  pdf
-_ZFlat5-8          405MB/s ± 1%  html4
-_ZFlat6-8          179MB/s ± 1%  txt1
-_ZFlat7-8          170MB/s ± 1%  txt2
-_ZFlat8-8          189MB/s ± 1%  txt3
-_ZFlat9-8          164MB/s ± 1%  txt4
-_ZFlat10-8         479MB/s ± 1%  pb
-_ZFlat11-8         270MB/s ± 1%  gaviota
-
-
-
-For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
-are the numbers from C++ Snappy's
-
-make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
-
-BM_UFlat/0     2.4GB/s  html
-BM_UFlat/1     1.4GB/s  urls
-BM_UFlat/2    21.8GB/s  jpg
-BM_UFlat/3     1.5GB/s  jpg_200
-BM_UFlat/4    13.3GB/s  pdf
-BM_UFlat/5     2.1GB/s  html4
-BM_UFlat/6     1.0GB/s  txt1
-BM_UFlat/7   959.4MB/s  txt2
-BM_UFlat/8     1.0GB/s  txt3
-BM_UFlat/9   864.5MB/s  txt4
-BM_UFlat/10    2.9GB/s  pb
-BM_UFlat/11    1.2GB/s  gaviota
-
-BM_ZFlat/0   944.3MB/s  html (22.31 %)
-BM_ZFlat/1   501.6MB/s  urls (47.78 %)
-BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
-BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
-BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
-BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
-BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
-BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
-BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
-BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
-BM_ZFlat/10    1.2GB/s  pb (19.68 %)
-BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go b/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
deleted file mode 100644
index 150d91b..0000000
--- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// Copyright 2016 The Snappy-Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-package snappy
-
-// emitLiteral has the same semantics as in encode_other.go.
-//
-//go:noescape
-func emitLiteral(dst, lit []byte) int
-
-// emitCopy has the same semantics as in encode_other.go.
-//
-//go:noescape
-func emitCopy(dst []byte, offset, length int) int
-
-// extendMatch has the same semantics as in encode_other.go.
-//
-//go:noescape
-func extendMatch(src []byte, i, j int) int
-
-// encodeBlock has the same semantics as in encode_other.go.
-//
-//go:noescape
-func encodeBlock(dst, src []byte) (d int)
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s b/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
deleted file mode 100644
index adfd979..0000000
--- a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
+++ /dev/null
@@ -1,730 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine
-// +build gc
-// +build !noasm
-
-#include "textflag.h"
-
-// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
-// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
-// https://github.com/golang/snappy/issues/29
-//
-// As a workaround, the package was built with a known good assembler, and
-// those instructions were disassembled by "objdump -d" to yield the
-//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
-// style comments, in AT&T asm syntax. Note that rsp here is a physical
-// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
-// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
-// fine on Go 1.6.
-
-// The asm code generally follows the pure Go code in encode_other.go, except
-// where marked with a "!!!".
-
-// ----------------------------------------------------------------------------
-
-// func emitLiteral(dst, lit []byte) int
-//
-// All local variables fit into registers. The register allocation:
-//	- AX	len(lit)
-//	- BX	n
-//	- DX	return value
-//	- DI	&dst[i]
-//	- R10	&lit[0]
-//
-// The 24 bytes of stack space is to call runtime·memmove.
-//
-// The unusual register allocation of local variables, such as R10 for the
-// source pointer, matches the allocation used at the call site in encodeBlock,
-// which makes it easier to manually inline this function.
-TEXT ·emitLiteral(SB), NOSPLIT, $24-56
-	MOVQ dst_base+0(FP), DI
-	MOVQ lit_base+24(FP), R10
-	MOVQ lit_len+32(FP), AX
-	MOVQ AX, DX
-	MOVL AX, BX
-	SUBL $1, BX
-
-	CMPL BX, $60
-	JLT  oneByte
-	CMPL BX, $256
-	JLT  twoBytes
-
-threeBytes:
-	MOVB $0xf4, 0(DI)
-	MOVW BX, 1(DI)
-	ADDQ $3, DI
-	ADDQ $3, DX
-	JMP  memmove
-
-twoBytes:
-	MOVB $0xf0, 0(DI)
-	MOVB BX, 1(DI)
-	ADDQ $2, DI
-	ADDQ $2, DX
-	JMP  memmove
-
-oneByte:
-	SHLB $2, BX
-	MOVB BX, 0(DI)
-	ADDQ $1, DI
-	ADDQ $1, DX
-
-memmove:
-	MOVQ DX, ret+48(FP)
-
-	// copy(dst[i:], lit)
-	//
-	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
-	// DI, R10 and AX as arguments.
-	MOVQ DI, 0(SP)
-	MOVQ R10, 8(SP)
-	MOVQ AX, 16(SP)
-	CALL runtime·memmove(SB)
-	RET
-
-// ----------------------------------------------------------------------------
-
-// func emitCopy(dst []byte, offset, length int) int
-//
-// All local variables fit into registers. The register allocation:
-//	- AX	length
-//	- SI	&dst[0]
-//	- DI	&dst[i]
-//	- R11	offset
-//
-// The unusual register allocation of local variables, such as R11 for the
-// offset, matches the allocation used at the call site in encodeBlock, which
-// makes it easier to manually inline this function.
-TEXT ·emitCopy(SB), NOSPLIT, $0-48
-	MOVQ dst_base+0(FP), DI
-	MOVQ DI, SI
-	MOVQ offset+24(FP), R11
-	MOVQ length+32(FP), AX
-
-loop0:
-	// for length >= 68 { etc }
-	CMPL AX, $68
-	JLT  step1
-
-	// Emit a length 64 copy, encoded as 3 bytes.
-	MOVB $0xfe, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $64, AX
-	JMP  loop0
-
-step1:
-	// if length > 64 { etc }
-	CMPL AX, $64
-	JLE  step2
-
-	// Emit a length 60 copy, encoded as 3 bytes.
-	MOVB $0xee, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $60, AX
-
-step2:
-	// if length >= 12 || offset >= 2048 { goto step3 }
-	CMPL AX, $12
-	JGE  step3
-	CMPL R11, $2048
-	JGE  step3
-
-	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB R11, 1(DI)
-	SHRL $8, R11
-	SHLB $5, R11
-	SUBB $4, AX
-	SHLB $2, AX
-	ORB  AX, R11
-	ORB  $1, R11
-	MOVB R11, 0(DI)
-	ADDQ $2, DI
-
-	// Return the number of bytes written.
-	SUBQ SI, DI
-	MOVQ DI, ret+40(FP)
-	RET
-
-step3:
-	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, AX
-	SHLB $2, AX
-	ORB  $2, AX
-	MOVB AX, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-
-	// Return the number of bytes written.
-	SUBQ SI, DI
-	MOVQ DI, ret+40(FP)
-	RET
-
-// ----------------------------------------------------------------------------
-
-// func extendMatch(src []byte, i, j int) int
-//
-// All local variables fit into registers. The register allocation:
-//	- DX	&src[0]
-//	- SI	&src[j]
-//	- R13	&src[len(src) - 8]
-//	- R14	&src[len(src)]
-//	- R15	&src[i]
-//
-// The unusual register allocation of local variables, such as R15 for a source
-// pointer, matches the allocation used at the call site in encodeBlock, which
-// makes it easier to manually inline this function.
-TEXT ·extendMatch(SB), NOSPLIT, $0-48
-	MOVQ src_base+0(FP), DX
-	MOVQ src_len+8(FP), R14
-	MOVQ i+24(FP), R15
-	MOVQ j+32(FP), SI
-	ADDQ DX, R14
-	ADDQ DX, R15
-	ADDQ DX, SI
-	MOVQ R14, R13
-	SUBQ $8, R13
-
-cmp8:
-	// As long as we are 8 or more bytes before the end of src, we can load and
-	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ SI, R13
-	JA   cmp1
-	MOVQ (R15), AX
-	MOVQ (SI), BX
-	CMPQ AX, BX
-	JNE  bsf
-	ADDQ $8, R15
-	ADDQ $8, SI
-	JMP  cmp8
-
-bsf:
-	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
-	// the index of the first byte that differs. The BSF instruction finds the
-	// least significant 1 bit, the amd64 architecture is little-endian, and
-	// the shift by 3 converts a bit index to a byte index.
-	XORQ AX, BX
-	BSFQ BX, BX
-	SHRQ $3, BX
-	ADDQ BX, SI
-
-	// Convert from &src[ret] to ret.
-	SUBQ DX, SI
-	MOVQ SI, ret+40(FP)
-	RET
-
-cmp1:
-	// In src's tail, compare 1 byte at a time.
-	CMPQ SI, R14
-	JAE  extendMatchEnd
-	MOVB (R15), AX
-	MOVB (SI), BX
-	CMPB AX, BX
-	JNE  extendMatchEnd
-	ADDQ $1, R15
-	ADDQ $1, SI
-	JMP  cmp1
-
-extendMatchEnd:
-	// Convert from &src[ret] to ret.
-	SUBQ DX, SI
-	MOVQ SI, ret+40(FP)
-	RET
-
-// ----------------------------------------------------------------------------
-
-// func encodeBlock(dst, src []byte) (d int)
-//
-// All local variables fit into registers, other than "var table". The register
-// allocation:
-//	- AX	.	.
-//	- BX	.	.
-//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
-//	- DX	64	&src[0], tableSize
-//	- SI	72	&src[s]
-//	- DI	80	&dst[d]
-//	- R9	88	sLimit
-//	- R10	.	&src[nextEmit]
-//	- R11	96	prevHash, currHash, nextHash, offset
-//	- R12	104	&src[base], skip
-//	- R13	.	&src[nextS], &src[len(src) - 8]
-//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
-//	- R15	112	candidate
-//
-// The second column (56, 64, etc) is the stack offset to spill the registers
-// when calling other functions. We could pack this slightly tighter, but it's
-// simpler to have a dedicated spill map independent of the function called.
-//
-// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
-// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
-// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
-TEXT ·encodeBlock(SB), 0, $32888-56
-	MOVQ dst_base+0(FP), DI
-	MOVQ src_base+24(FP), SI
-	MOVQ src_len+32(FP), R14
-
-	// shift, tableSize := uint32(32-8), 1<<8
-	MOVQ $24, CX
-	MOVQ $256, DX
-
-calcShift:
-	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
-	//	shift--
-	// }
-	CMPQ DX, $16384
-	JGE  varTable
-	CMPQ DX, R14
-	JGE  varTable
-	SUBQ $1, CX
-	SHLQ $1, DX
-	JMP  calcShift
-
-varTable:
-	// var table [maxTableSize]uint16
-	//
-	// In the asm code, unlike the Go code, we can zero-initialize only the
-	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
-	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
-	// 2048 writes that would zero-initialize all of table's 32768 bytes.
-	SHRQ $3, DX
-	LEAQ table-32768(SP), BX
-	PXOR X0, X0
-
-memclr:
-	MOVOU X0, 0(BX)
-	ADDQ  $16, BX
-	SUBQ  $1, DX
-	JNZ   memclr
-
-	// !!! DX = &src[0]
-	MOVQ SI, DX
-
-	// sLimit := len(src) - inputMargin
-	MOVQ R14, R9
-	SUBQ $15, R9
-
-	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
-	// change for the rest of the function.
-	MOVQ CX, 56(SP)
-	MOVQ DX, 64(SP)
-	MOVQ R9, 88(SP)
-
-	// nextEmit := 0
-	MOVQ DX, R10
-
-	// s := 1
-	ADDQ $1, SI
-
-	// nextHash := hash(load32(src, s), shift)
-	MOVL  0(SI), R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-outer:
-	// for { etc }
-
-	// skip := 32
-	MOVQ $32, R12
-
-	// nextS := s
-	MOVQ SI, R13
-
-	// candidate := 0
-	MOVQ $0, R15
-
-inner0:
-	// for { etc }
-
-	// s := nextS
-	MOVQ R13, SI
-
-	// bytesBetweenHashLookups := skip >> 5
-	MOVQ R12, R14
-	SHRQ $5, R14
-
-	// nextS = s + bytesBetweenHashLookups
-	ADDQ R14, R13
-
-	// skip += bytesBetweenHashLookups
-	ADDQ R14, R12
-
-	// if nextS > sLimit { goto emitRemainder }
-	MOVQ R13, AX
-	SUBQ DX, AX
-	CMPQ AX, R9
-	JA   emitRemainder
-
-	// candidate = int(table[nextHash])
-	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
-	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
-	BYTE $0x4e
-	BYTE $0x0f
-	BYTE $0xb7
-	BYTE $0x7c
-	BYTE $0x5c
-	BYTE $0x78
-
-	// table[nextHash] = uint16(s)
-	MOVQ SI, AX
-	SUBQ DX, AX
-
-	// XXX: MOVW AX, table-32768(SP)(R11*2)
-	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
-	BYTE $0x66
-	BYTE $0x42
-	BYTE $0x89
-	BYTE $0x44
-	BYTE $0x5c
-	BYTE $0x78
-
-	// nextHash = hash(load32(src, nextS), shift)
-	MOVL  0(R13), R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// if load32(src, s) != load32(src, candidate) { continue } break
-	MOVL 0(SI), AX
-	MOVL (DX)(R15*1), BX
-	CMPL AX, BX
-	JNE  inner0
-
-fourByteMatch:
-	// As per the encode_other.go code:
-	//
-	// A 4-byte match has been found. We'll later see etc.
-
-	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
-	// on inputMargin in encode.go.
-	MOVQ SI, AX
-	SUBQ R10, AX
-	CMPQ AX, $16
-	JLE  emitLiteralFastPath
-
-	// ----------------------------------------
-	// Begin inline of the emitLiteral call.
-	//
-	// d += emitLiteral(dst[d:], src[nextEmit:s])
-
-	MOVL AX, BX
-	SUBL $1, BX
-
-	CMPL BX, $60
-	JLT  inlineEmitLiteralOneByte
-	CMPL BX, $256
-	JLT  inlineEmitLiteralTwoBytes
-
-inlineEmitLiteralThreeBytes:
-	MOVB $0xf4, 0(DI)
-	MOVW BX, 1(DI)
-	ADDQ $3, DI
-	JMP  inlineEmitLiteralMemmove
-
-inlineEmitLiteralTwoBytes:
-	MOVB $0xf0, 0(DI)
-	MOVB BX, 1(DI)
-	ADDQ $2, DI
-	JMP  inlineEmitLiteralMemmove
-
-inlineEmitLiteralOneByte:
-	SHLB $2, BX
-	MOVB BX, 0(DI)
-	ADDQ $1, DI
-
-inlineEmitLiteralMemmove:
-	// Spill local variables (registers) onto the stack; call; unspill.
-	//
-	// copy(dst[i:], lit)
-	//
-	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
-	// DI, R10 and AX as arguments.
-	MOVQ DI, 0(SP)
-	MOVQ R10, 8(SP)
-	MOVQ AX, 16(SP)
-	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
-	MOVQ SI, 72(SP)
-	MOVQ DI, 80(SP)
-	MOVQ R15, 112(SP)
-	CALL runtime·memmove(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 72(SP), SI
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
-	MOVQ 112(SP), R15
-	JMP  inner1
-
-inlineEmitLiteralEnd:
-	// End inline of the emitLiteral call.
-	// ----------------------------------------
-
-emitLiteralFastPath:
-	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
-	MOVB AX, BX
-	SUBB $1, BX
-	SHLB $2, BX
-	MOVB BX, (DI)
-	ADDQ $1, DI
-
-	// !!! Implement the copy from lit to dst as a 16-byte load and store.
-	// (Encode's documentation says that dst and src must not overlap.)
-	//
-	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
-	// OK. Subsequent iterations will fix up the overrun.
-	//
-	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
-	// 16-byte loads and stores. This technique probably wouldn't be as
-	// effective on architectures that are fussier about alignment.
-	MOVOU 0(R10), X0
-	MOVOU X0, 0(DI)
-	ADDQ  AX, DI
-
-inner1:
-	// for { etc }
-
-	// base := s
-	MOVQ SI, R12
-
-	// !!! offset := base - candidate
-	MOVQ R12, R11
-	SUBQ R15, R11
-	SUBQ DX, R11
-
-	// ----------------------------------------
-	// Begin inline of the extendMatch call.
-	//
-	// s = extendMatch(src, candidate+4, s+4)
-
-	// !!! R14 = &src[len(src)]
-	MOVQ src_len+32(FP), R14
-	ADDQ DX, R14
-
-	// !!! R13 = &src[len(src) - 8]
-	MOVQ R14, R13
-	SUBQ $8, R13
-
-	// !!! R15 = &src[candidate + 4]
-	ADDQ $4, R15
-	ADDQ DX, R15
-
-	// !!! s += 4
-	ADDQ $4, SI
-
-inlineExtendMatchCmp8:
-	// As long as we are 8 or more bytes before the end of src, we can load and
-	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ SI, R13
-	JA   inlineExtendMatchCmp1
-	MOVQ (R15), AX
-	MOVQ (SI), BX
-	CMPQ AX, BX
-	JNE  inlineExtendMatchBSF
-	ADDQ $8, R15
-	ADDQ $8, SI
-	JMP  inlineExtendMatchCmp8
-
-inlineExtendMatchBSF:
-	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
-	// the index of the first byte that differs. The BSF instruction finds the
-	// least significant 1 bit, the amd64 architecture is little-endian, and
-	// the shift by 3 converts a bit index to a byte index.
-	XORQ AX, BX
-	BSFQ BX, BX
-	SHRQ $3, BX
-	ADDQ BX, SI
-	JMP  inlineExtendMatchEnd
-
-inlineExtendMatchCmp1:
-	// In src's tail, compare 1 byte at a time.
-	CMPQ SI, R14
-	JAE  inlineExtendMatchEnd
-	MOVB (R15), AX
-	MOVB (SI), BX
-	CMPB AX, BX
-	JNE  inlineExtendMatchEnd
-	ADDQ $1, R15
-	ADDQ $1, SI
-	JMP  inlineExtendMatchCmp1
-
-inlineExtendMatchEnd:
-	// End inline of the extendMatch call.
-	// ----------------------------------------
-
-	// ----------------------------------------
-	// Begin inline of the emitCopy call.
-	//
-	// d += emitCopy(dst[d:], base-candidate, s-base)
-
-	// !!! length := s - base
-	MOVQ SI, AX
-	SUBQ R12, AX
-
-inlineEmitCopyLoop0:
-	// for length >= 68 { etc }
-	CMPL AX, $68
-	JLT  inlineEmitCopyStep1
-
-	// Emit a length 64 copy, encoded as 3 bytes.
-	MOVB $0xfe, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $64, AX
-	JMP  inlineEmitCopyLoop0
-
-inlineEmitCopyStep1:
-	// if length > 64 { etc }
-	CMPL AX, $64
-	JLE  inlineEmitCopyStep2
-
-	// Emit a length 60 copy, encoded as 3 bytes.
-	MOVB $0xee, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-	SUBL $60, AX
-
-inlineEmitCopyStep2:
-	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
-	CMPL AX, $12
-	JGE  inlineEmitCopyStep3
-	CMPL R11, $2048
-	JGE  inlineEmitCopyStep3
-
-	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB R11, 1(DI)
-	SHRL $8, R11
-	SHLB $5, R11
-	SUBB $4, AX
-	SHLB $2, AX
-	ORB  AX, R11
-	ORB  $1, R11
-	MOVB R11, 0(DI)
-	ADDQ $2, DI
-	JMP  inlineEmitCopyEnd
-
-inlineEmitCopyStep3:
-	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, AX
-	SHLB $2, AX
-	ORB  $2, AX
-	MOVB AX, 0(DI)
-	MOVW R11, 1(DI)
-	ADDQ $3, DI
-
-inlineEmitCopyEnd:
-	// End inline of the emitCopy call.
-	// ----------------------------------------
-
-	// nextEmit = s
-	MOVQ SI, R10
-
-	// if s >= sLimit { goto emitRemainder }
-	MOVQ SI, AX
-	SUBQ DX, AX
-	CMPQ AX, R9
-	JAE  emitRemainder
-
-	// As per the encode_other.go code:
-	//
-	// We could immediately etc.
-
-	// x := load64(src, s-1)
-	MOVQ -1(SI), R14
-
-	// prevHash := hash(uint32(x>>0), shift)
-	MOVL  R14, R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// table[prevHash] = uint16(s-1)
-	MOVQ SI, AX
-	SUBQ DX, AX
-	SUBQ $1, AX
-
-	// XXX: MOVW AX, table-32768(SP)(R11*2)
-	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
-	BYTE $0x66
-	BYTE $0x42
-	BYTE $0x89
-	BYTE $0x44
-	BYTE $0x5c
-	BYTE $0x78
-
-	// currHash := hash(uint32(x>>8), shift)
-	SHRQ  $8, R14
-	MOVL  R14, R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// candidate = int(table[currHash])
-	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
-	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
-	BYTE $0x4e
-	BYTE $0x0f
-	BYTE $0xb7
-	BYTE $0x7c
-	BYTE $0x5c
-	BYTE $0x78
-
-	// table[currHash] = uint16(s)
-	ADDQ $1, AX
-
-	// XXX: MOVW AX, table-32768(SP)(R11*2)
-	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
-	BYTE $0x66
-	BYTE $0x42
-	BYTE $0x89
-	BYTE $0x44
-	BYTE $0x5c
-	BYTE $0x78
-
-	// if uint32(x>>8) == load32(src, candidate) { continue }
-	MOVL (DX)(R15*1), BX
-	CMPL R14, BX
-	JEQ  inner1
-
-	// nextHash = hash(uint32(x>>16), shift)
-	SHRQ  $8, R14
-	MOVL  R14, R11
-	IMULL $0x1e35a7bd, R11
-	SHRL  CX, R11
-
-	// s++
-	ADDQ $1, SI
-
-	// break out of the inner1 for loop, i.e. continue the outer loop.
-	JMP outer
-
-emitRemainder:
-	// if nextEmit < len(src) { etc }
-	MOVQ src_len+32(FP), AX
-	ADDQ DX, AX
-	CMPQ R10, AX
-	JEQ  encodeBlockEnd
-
-	// d += emitLiteral(dst[d:], src[nextEmit:])
-	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R10, 24(SP)
-	SUBQ R10, AX
-	MOVQ AX, 32(SP)
-	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
-
-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ DI, 80(SP)
-	CALL ·emitLiteral(SB)
-	MOVQ 80(SP), DI
-
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ 48(SP), DI
-
-encodeBlockEnd:
-	MOVQ dst_base+0(FP), AX
-	SUBQ AX, DI
-	MOVQ DI, d+48(FP)
-	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/runbench.cmd b/vendor/github.com/klauspost/compress/snappy/runbench.cmd
deleted file mode 100644
index d24eb4b..0000000
--- a/vendor/github.com/klauspost/compress/snappy/runbench.cmd
+++ /dev/null
@@ -1,2 +0,0 @@
-del old.txt
-go test -bench=. >>old.txt && go test -bench=. >>old.txt && go test -bench=. >>old.txt && benchstat -delta-test=ttest old.txt new.txt
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index ac3640d..beb7fa8 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -5,7 +5,6 @@ It offers a very wide range of compression / speed trade-off, while being backed
 A high performance compression algorithm is implemented. For now focused on speed. 
 
 This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
-Note that custom dictionaries are only supported for decompression.
 
 This package is pure Go and without use of "unsafe". 
 
@@ -17,30 +16,28 @@ Currently the package is heavily optimized for 64 bit processors and will be sig
 
 Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
 
-Godoc Documentation: https://godoc.org/github.com/klauspost/compress/zstd
-
+[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/compress/zstd.svg)](https://pkg.go.dev/github.com/klauspost/compress/zstd)
 
 ## Compressor
 
 ### Status: 
 
 STABLE - there may always be subtle bugs, a wide variety of content has been tested and the library is actively 
-used by several projects. This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
-kindly supplied by [fuzzit.dev](https://fuzzit.dev/).
+used by several projects. This library is being [fuzz-tested](https://github.com/klauspost/compress-fuzz) for all updates.
 
 There may still be specific combinations of data types/size/settings that could lead to edge cases, 
 so as always, testing is recommended.  
 
 For now, a high speed (fastest) and medium-fast (default) compressor has been implemented. 
 
-The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
-The "Default" compression ratio is roughly equivalent to zstd level 3 (default).
+* The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
+* The "Default" compression ratio is roughly equivalent to zstd level 3 (default).
+* The "Better" compression ratio is roughly equivalent to zstd level 7.
+* The "Best" compression ratio is roughly equivalent to zstd level 11.
 
 In terms of speed, it is typically 2x as fast as the stdlib deflate/gzip in its fastest mode. 
 The compression ratio compared to stdlib is around level 3, but usually 3x as fast.
 
-Compared to cgo zstd, the speed is around level 3 (default), but compression slightly worse, between level 1&2.
-
  
 ### Usage
 
@@ -55,11 +52,11 @@ To create a writer with default options, do like this:
 ```Go
 // Compress input to output.
 func Compress(in io.Reader, out io.Writer) error {
-    w, err := NewWriter(output)
+    enc, err := zstd.NewWriter(out)
     if err != nil {
         return err
     }
-    _, err := io.Copy(w, input)
+    _, err = io.Copy(enc, in)
     if err != nil {
         enc.Close()
         return err
@@ -81,6 +78,9 @@ of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is
 in the future. So if you want to limit concurrency for future updates, specify the concurrency
 you would like.
 
+If you would like stream encoding to be done without spawning async goroutines, use `WithEncoderConcurrency(1)`
+which will compress input as each block is completed, blocking on writes until each has completed.
+
 You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined 
 compression settings can be specified.
 
@@ -107,7 +107,8 @@ and seems to ignore concatenated streams, even though [it is part of the spec](h
 For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
 
 `EncodeAll` will encode all input in src and append it to dst.
-This function can be called concurrently, but each call will only run on a single goroutine.
+This function can be called concurrently. 
+Each call will only run on a same goroutine as the caller.
 
 Encoded blocks can be concatenated and the result will be the combined input stream.
 Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
@@ -141,7 +142,7 @@ I have collected some speed examples to compare speed and compression against ot
 
 * `file` is the input file.
 * `out` is the compressor used. `zskp` is this package. `zstd` is the Datadog cgo library. `gzstd/gzkp` is gzip standard and this library.
-* `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default".
+* `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default"; 3 is "better", 4 is "best".
 * `insize`/`outsize` is the input/output size.
 * `millis` is the number of milliseconds used for compression.
 * `mb/s` is megabytes (2^20 bytes) per second.
@@ -152,121 +153,108 @@ http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 
 This package:
 file    out     level   insize      outsize     millis  mb/s
-silesia.tar zskp    1   211947520   73101992    643     313.87
-silesia.tar zskp    2   211947520   67504318    969     208.38
-silesia.tar zskp    3   211947520   65177448    1899    106.44
+silesia.tar zskp    1   211947520   73821326    634     318.47
+silesia.tar zskp    2   211947520   67655404    1508    133.96
+silesia.tar zskp    3   211947520   64746933    3000    67.37
+silesia.tar zskp    4   211947520   60073508    16926   11.94
 
 cgo zstd:
 silesia.tar zstd    1   211947520   73605392    543     371.56
 silesia.tar zstd    3   211947520   66793289    864     233.68
 silesia.tar zstd    6   211947520   62916450    1913    105.66
+silesia.tar zstd    9   211947520   60212393    5063    39.92
 
 gzip, stdlib/this package:
-silesia.tar gzstd   1   211947520   80007735    1654    122.21
-silesia.tar gzkp    1   211947520   80369488    1168    173.06
+silesia.tar gzstd   1   211947520   80007735    1498    134.87
+silesia.tar gzkp    1   211947520   80088272    1009    200.31
 
 GOB stream of binary data. Highly compressible.
 https://files.klauspost.com/compress/gob-stream.7z
 
 file        out     level   insize  outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  235022249   3088    590.30
-gob-stream  zskp    2   1911399616  205669791   3786    481.34
-gob-stream  zskp    3   1911399616  185792019   9324    195.48
+gob-stream  zskp    1   1911399616  233948096   3230    564.34
+gob-stream  zskp    2   1911399616  203997694   4997    364.73
+gob-stream  zskp    3   1911399616  173526523   13435   135.68
+gob-stream  zskp    4   1911399616  162195235   47559   38.33
+
 gob-stream  zstd    1   1911399616  249810424   2637    691.26
 gob-stream  zstd    3   1911399616  208192146   3490    522.31
 gob-stream  zstd    6   1911399616  193632038   6687    272.56
-gob-stream  gzstd   1   1911399616  357382641   10251   177.82
-gob-stream  gzkp    1   1911399616  362156523   5695    320.08
+gob-stream  zstd    9   1911399616  177620386   16175   112.70
+
+gob-stream  gzstd   1   1911399616  357382013   9046    201.49
+gob-stream  gzkp    1   1911399616  359136669   4885    373.08
 
 The test data for the Large Text Compression Benchmark is the first
 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
 http://mattmahoney.net/dc/textdata.html
 
 file    out level   insize      outsize     millis  mb/s
-enwik9  zskp    1   1000000000  343848582   3609    264.18
-enwik9  zskp    2   1000000000  317276632   5746    165.97
-enwik9  zskp    3   1000000000  294540704   11725   81.34
+enwik9  zskp    1   1000000000  343833605   3687    258.64
+enwik9  zskp    2   1000000000  317001237   7672    124.29
+enwik9  zskp    3   1000000000  291915823   15923   59.89
+enwik9  zskp    4   1000000000  261710291   77697   12.27
+
 enwik9  zstd    1   1000000000  358072021   3110    306.65
 enwik9  zstd    3   1000000000  313734672   4784    199.35
 enwik9  zstd    6   1000000000  295138875   10290   92.68
-enwik9  gzstd   1   1000000000  382578136   9604    99.30
-enwik9  gzkp    1   1000000000  383825945   6544    145.73
+enwik9  zstd    9   1000000000  278348700   28549   33.40
+
+enwik9  gzstd   1   1000000000  382578136   8608    110.78
+enwik9  gzkp    1   1000000000  382781160   5628    169.45
 
 Highly compressible JSON file.
 https://files.klauspost.com/compress/github-june-2days-2019.json.zst
 
 file                        out level   insize      outsize     millis  mb/s
-github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
-github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
-github-june-2days-2019.json zskp    3   6273951764  537511906   29252   204.54
+github-june-2days-2019.json zskp    1   6273951764  697439532   9789    611.17
+github-june-2days-2019.json zskp    2   6273951764  610876538   18553   322.49
+github-june-2days-2019.json zskp    3   6273951764  517662858   44186   135.41
+github-june-2days-2019.json zskp    4   6273951764  464617114   165373  36.18
+
 github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
 github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
 github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
-github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
-github-june-2days-2019.json gzkp    1   6273951764  1128755542  19236   311.03
+github-june-2days-2019.json zstd    9   6273951764  601974523   52413   114.16
+
+github-june-2days-2019.json gzstd   1   6273951764  1164397768  26793   223.32
+github-june-2days-2019.json gzkp    1   6273951764  1120631856  17693   338.16
 
 VM Image, Linux mint with a few installed applications:
 https://files.klauspost.com/compress/rawstudio-mint14.7z
 
 file                    out level   insize      outsize     millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
-rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
-rawstudio-mint14.tar    zskp    3   8558382592  3224594213  71751   113.75
+rawstudio-mint14.tar    zskp    1   8558382592  3718400221  18206   448.29
+rawstudio-mint14.tar    zskp    2   8558382592  3326118337  37074   220.15
+rawstudio-mint14.tar    zskp    3   8558382592  3163842361  87306   93.49
+rawstudio-mint14.tar    zskp    4   8558382592  2970480650  783862  10.41
+
 rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
 rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
 rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
-rawstudio-mint14.tar    gzkp    1   8558382592  3970463184  41749   195.49
+rawstudio-mint14.tar    zstd    9   8558382592  3160778861  140946  57.91
+
+rawstudio-mint14.tar    gzstd   1   8558382592  3926234992  51345   158.96
+rawstudio-mint14.tar    gzkp    1   8558382592  3960117298  36722   222.26
 
 CSV data:
 https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
 
 file                    out level   insize      outsize     millis  mb/s
-nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
-nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
-nyc-taxi-data-10M.csv   zskp    3   3325605752  538490114   19880   159.53
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641319332   9462    335.17
+nyc-taxi-data-10M.csv   zskp    2   3325605752  588976126   17570   180.50
+nyc-taxi-data-10M.csv   zskp    3   3325605752  529329260   32432   97.79
+nyc-taxi-data-10M.csv   zskp    4   3325605752  474949772   138025  22.98
+
 nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
 nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
 nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
-nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
-nyc-taxi-data-10M.csv   gzkp    1   3325605752  924718719   16388   193.53
-```
+nyc-taxi-data-10M.csv   zstd    9   3325605752  517554797   64565   49.12
 
-### Converters
-
-As part of the development process a *Snappy* -> *Zstandard* converter was also built.
-
-This can convert a *framed* [Snappy Stream](https://godoc.org/github.com/golang/snappy#Writer) to a zstd stream. 
-Note that a single block is not framed.
-
-Conversion is done by converting the stream directly from Snappy without intermediate full decoding.
-Therefore the compression ratio is much less than what can be done by a full decompression
-and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
-any errors being generated.
-No CRC value is being generated and not all CRC values of the Snappy stream are checked.
-However, it provides really fast re-compression of Snappy streams.
-
-
-```
-BenchmarkSnappy_ConvertSilesia-8           1  1156001600 ns/op   183.35 MB/s
-Snappy len 103008711 -> zstd len 82687318
-
-BenchmarkSnappy_Enwik9-8           1  6472998400 ns/op   154.49 MB/s
-Snappy len 508028601 -> zstd len 390921079
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928654908   21270   149.11
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  922273214   13929   227.68
 ```
 
-
-```Go
-    s := zstd.SnappyConverter{}
-    n, err = s.Convert(input, output)
-    if err != nil {
-        fmt.Println("Re-compressed stream to", n, "bytes")
-    }
-```
-
-The converter `s` can be reused to avoid allocations, even after errors.
-
-
 ## Decompressor
 
 Staus: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
@@ -287,20 +275,25 @@ For streaming use a simple setup could look like this:
 import "github.com/klauspost/compress/zstd"
 
 func Decompress(in io.Reader, out io.Writer) error {
-    d, err := zstd.NewReader(input)
+    d, err := zstd.NewReader(in)
     if err != nil {
         return err
     }
     defer d.Close()
     
     // Copy content...
-    _, err := io.Copy(out, d)
+    _, err = io.Copy(out, d)
     return err
 }
 ```
 
-It is important to use the "Close" function when you no longer need the Reader to stop running goroutines. 
-See "Allocation-less operation" below.
+It is important to use the "Close" function when you no longer need the Reader to stop running goroutines, 
+when running with default settings.
+Goroutines will exit once an error has been returned, including `io.EOF` at the end of a stream.
+
+Streams are decoded concurrently in 4 asynchronous stages to give the best possible throughput.
+However, if you prefer synchronous decompression, use `WithDecoderConcurrency(1)` which will decompress data 
+as it is being requested only.
 
 For decoding buffers, it could look something like this:
 
@@ -309,7 +302,7 @@ import "github.com/klauspost/compress/zstd"
 
 // Create a reader that caches decompressors.
 // For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil)
+var decoder, _ = zstd.NewReader(nil, WithDecoderConcurrency(0))
 
 // Decompress a buffer. We don't supply a destination buffer,
 // so it will be allocated by the decoder.
@@ -319,9 +312,12 @@ func Decompress(src []byte) ([]byte, error) {
 ```
 
 Both of these cases should provide the functionality needed. 
-The decoder can be used for *concurrent* decompression of multiple buffers. 
+The decoder can be used for *concurrent* decompression of multiple buffers.
+By default 4 decompressors will be created. 
+
 It will only allow a certain number of concurrent operations to run. 
-To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
+To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.
+It is possible to use `WithDecoderConcurrency(0)` to create GOMAXPROCS decoders.
 
 ### Dictionaries
 
@@ -337,6 +333,21 @@ A re-used Decoder will still contain the dictionaries registered.
 
 When registering multiple dictionaries with the same ID, the last one will be used.
 
+It is possible to use dictionaries when compressing data.
+
+To enable a dictionary use `WithEncoderDict(dict []byte)`. Here only one dictionary will be used 
+and it will likely be used even if it doesn't improve compression. 
+
+The used dictionary must be used to decompress the content.
+
+For any real gains, the dictionary should be built with similar data. 
+If an unsuitable dictionary is used the output may be slightly larger than using no dictionary.
+Use the [zstd commandline tool](https://github.com/facebook/zstd/releases) to build a dictionary from sample data.
+For information see [zstd dictionary information](https://github.com/facebook/zstd#the-case-for-small-data-compression). 
+
+For now there is a fixed startup performance penalty for compressing content with dictionaries. 
+This will likely be improved over time. Just be aware to test performance when implementing.  
+
 ### Allocation-less operation
 
 The decoder has been designed to operate without allocations after a warmup. 
@@ -358,70 +369,71 @@ In this case no unneeded allocations should be made.
 The buffer decoder does everything on the same goroutine and does nothing concurrently.
 It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
 
-The stream decoder operates on
+The stream decoder will create goroutines that:
 
-* One goroutine reads input and splits the input to several block decoders.
-* A number of decoders will decode blocks.
-* A goroutine coordinates these blocks and sends history from one to the next.
+1) Reads input and splits the input into blocks.
+2) Decompression of literals.
+3) Decompression of sequences.
+4) Reconstruction of output stream.
 
 So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
 
+The concurrency level will, for streams, determine how many blocks ahead the compression will start.
+
 Since "blocks" are quite dependent on the output of the previous block stream decoding will only have limited concurrency.
 
-In practice this means that concurrency is often limited to utilizing about 2 cores effectively.
- 
- 
+In practice this means that concurrency is often limited to utilizing about 3 cores effectively.
+  
 ### Benchmarks
 
-These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
-
 The first two are streaming decodes and the last are smaller inputs. 
- 
+
+Running on AMD Ryzen 9 3950X 16-Core Processor. AMD64 assembly used.
+
 ```
-BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
-BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
-
-BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
-BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-
-Concurrent performance:
-
-BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
-
-BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
+BenchmarkDecoderSilesia-32    	                   5	 206878840 ns/op	1024.50 MB/s	   49808 B/op	      43 allocs/op
+BenchmarkDecoderEnwik9-32                          1	1271809000 ns/op	 786.28 MB/s	   72048 B/op	      52 allocs/op
+
+Concurrent blocks, performance:
+
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32         	   67356	     17857 ns/op	10321.96 MB/s	        22.48 pct	     102 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32     	  266656	      4421 ns/op	26823.21 MB/s	        11.89 pct	      19 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32      	   20992	     56842 ns/op	8477.17 MB/s	        39.90 pct	     754 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32        	   27456	     43932 ns/op	9714.01 MB/s	        33.27 pct	     524 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32      	   78432	     15047 ns/op	8319.15 MB/s	        40.34 pct	      66 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32       	   65800	     18436 ns/op	8249.63 MB/s	        37.75 pct	      88 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32          	  102993	     11523 ns/op	35546.09 MB/s	         3.637 pct	     143 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32    	 1000000	      1070 ns/op	95720.98 MB/s	        80.53 pct	       3 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32    	  749802	      1752 ns/op	70272.35 MB/s	       100.0 pct	       5 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32          	   22640	     52934 ns/op	13263.37 MB/s	        26.25 pct	    1014 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html.zst-32              	  226412	      5232 ns/op	19572.27 MB/s	        14.49 pct	      20 B/op	       0 allocs/op
+BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32     	  923041	      1276 ns/op	3194.71 MB/s	        31.26 pct	       0 B/op	       0 allocs/op
 ```
 
-This reflects the performance around May 2020, but this may be out of date.
+This reflects the performance around May 2022, but this may be out of date.
+
+## Zstd inside ZIP files
+
+It is possible to use zstandard to compress individual files inside zip archives.
+While this isn't widely supported it can be useful for internal files.
+
+To support the compression and decompression of these files you must register a compressor and decompressor.
+
+It is highly recommended registering the (de)compressors on individual zip Reader/Writer and NOT
+use the global registration functions. The main reason for this is that 2 registrations from 
+different packages will result in a panic.
+
+It is a good idea to only have a single compressor and decompressor, since they can be used for multiple zip
+files concurrently, and using a single instance will allow reusing some resources.
+
+See [this example](https://pkg.go.dev/github.com/klauspost/compress/zstd#example-ZipCompressor) for 
+how to compress and decompress files inside zip archives.
 
 # Contributions
 
 Contributions are always welcome. 
 For new features/fixes, remember to add tests and for performance enhancements include benchmarks.
 
-For sending files for reproducing errors use a service like [goobox](https://goobox.io/#/upload) or similar to share your files.
-
 For general feedback and experience reports, feel free to open an issue or write me on [Twitter](https://twitter.com/sh0dan).
 
 This package includes the excellent [`github.com/cespare/xxhash`](https://github.com/cespare/xxhash) package Copyright (c) 2016 Caleb Spare.
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 8544585..d7cd15b 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -7,6 +7,7 @@ package zstd
 import (
 	"encoding/binary"
 	"errors"
+	"fmt"
 	"io"
 	"math/bits"
 )
@@ -50,16 +51,23 @@ func (b *bitReader) getBits(n uint8) int {
 	if n == 0 /*|| b.bitsRead >= 64 */ {
 		return 0
 	}
-	return b.getBitsFast(n)
+	return int(b.get32BitsFast(n))
 }
 
-// getBitsFast requires that at least one bit is requested every time.
+// get32BitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
-func (b *bitReader) getBitsFast(n uint8) int {
+func (b *bitReader) get32BitsFast(n uint8) uint32 {
 	const regMask = 64 - 1
 	v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
 	b.bitsRead += n
-	return int(v)
+	return v
+}
+
+func (b *bitReader) get16BitsFast(n uint8) uint16 {
+	const regMask = 64 - 1
+	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
+	b.bitsRead += n
+	return v
 }
 
 // fillFast() will make sure at least 32 bits are available.
@@ -125,6 +133,9 @@ func (b *bitReader) remain() uint {
 func (b *bitReader) close() error {
 	// Release reference.
 	b.in = nil
+	if !b.finished() {
+		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
+	}
 	if b.bitsRead > 64 {
 		return io.ErrUnexpectedEOF
 	}
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
index 303ae90..b366182 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -38,7 +38,7 @@ func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
 	b.nBits += bits
 }
 
-// addBits32NC will add up to 32 bits.
+// addBits32NC will add up to 31 bits.
 // It will not check if there is space for them,
 // so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
@@ -46,6 +46,26 @@ func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
 	b.nBits += bits
 }
 
+// addBits64NC will add up to 64 bits.
+// There must be space for 32 bits.
+func (b *bitWriter) addBits64NC(value uint64, bits uint8) {
+	if bits <= 31 {
+		b.addBits32Clean(uint32(value), bits)
+		return
+	}
+	b.addBits32Clean(uint32(value), 32)
+	b.flush32()
+	b.addBits32Clean(uint32(value>>32), bits-32)
+}
+
+// addBits32Clean will add up to 32 bits.
+// It will not check if there is space for them.
+// The input must not contain more bits than specified.
+func (b *bitWriter) addBits32Clean(value uint32, bits uint8) {
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index c8ec6e3..b2bca33 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -5,9 +5,14 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
 	"sync"
 
 	"github.com/klauspost/compress/huff0"
@@ -38,6 +43,9 @@ const (
 	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
 	maxCompressedBlockSize = 128 << 10
 
+	compressedBlockOverAlloc    = 16
+	maxCompressedBlockSizeAlloc = 128<<10 + compressedBlockOverAlloc
+
 	// Maximum possible block size (all Raw+Uncompressed).
 	maxBlockSize = (1 << 21) - 1
 
@@ -76,17 +84,25 @@ type blockDec struct {
 	// Window size of the block.
 	WindowSize uint64
 
-	history     chan *history
-	input       chan struct{}
-	result      chan decodeOutput
-	sequenceBuf []seq
-	err         error
-	decWG       sync.WaitGroup
+	err error
+
+	// Check against this crc
+	checkCRC []byte
 
 	// Frame to use for singlethreaded decoding.
 	// Should not be used by the decoder itself since parent may be another frame.
 	localFrame *frameDec
 
+	sequence []seqVals
+
+	async struct {
+		newHist  *history
+		literals []byte
+		seqData  []byte
+		seqSize  int // Size of uncompressed sequences
+		fcs      uint64
+	}
+
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
@@ -109,13 +125,8 @@ func (b *blockDec) String() string {
 
 func newBlockDec(lowMem bool) *blockDec {
 	b := blockDec{
-		lowMem:  lowMem,
-		result:  make(chan decodeOutput, 1),
-		input:   make(chan struct{}, 1),
-		history: make(chan *history, 1),
+		lowMem: lowMem,
 	}
-	b.decWG.Add(1)
-	go b.startDecoder()
 	return &b
 }
 
@@ -123,44 +134,60 @@ func newBlockDec(lowMem bool) *blockDec {
 // Input must be a start of a block and will be at the end of the block when returned.
 func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.WindowSize = windowSize
-	tmp := br.readSmall(3)
-	if tmp == nil {
-		if debug {
-			println("Reading block header:", io.ErrUnexpectedEOF)
-		}
-		return io.ErrUnexpectedEOF
+	tmp, err := br.readSmall(3)
+	if err != nil {
+		println("Reading block header:", err)
+		return err
 	}
 	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
 	b.Last = bh&1 != 0
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
-	maxSize := maxBlockSize
+	maxSize := maxCompressedBlockSizeAlloc
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
 	case blockTypeRLE:
+		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+			if debugDecoder {
+				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrWindowSizeExceeded
+		}
 		b.RLESize = uint32(cSize)
 		if b.lowMem {
 			maxSize = cSize
 		}
 		cSize = 1
 	case blockTypeCompressed:
-		if debug {
+		if debugDecoder {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
-		maxSize = maxCompressedBlockSize
+		maxSize = maxCompressedBlockSizeAlloc
 		if windowSize < maxCompressedBlockSize && b.lowMem {
-			maxSize = int(windowSize)
+			maxSize = int(windowSize) + compressedBlockOverAlloc
 		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
-			if debug {
+			if debugDecoder {
 				printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
 			}
 			return ErrCompressedSizeTooBig
 		}
+		// Empty compressed blocks must at least be 2 bytes
+		// for Literals_Block_Type and one for Sequences_Section_Header.
+		if cSize < 2 {
+			return ErrBlockTooSmall
+		}
 	case blockTypeRaw:
+		if cSize > maxCompressedBlockSize || cSize > int(b.WindowSize) {
+			if debugDecoder {
+				printf("rle block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrWindowSizeExceeded
+		}
+
 		b.RLESize = 0
 		// We do not need a destination for raw blocks.
 		maxSize = -1
@@ -170,19 +197,18 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 
 	// Read block data.
 	if cap(b.dataStorage) < cSize {
-		if b.lowMem {
-			b.dataStorage = make([]byte, 0, cSize)
+		if b.lowMem || cSize > maxCompressedBlockSize {
+			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
 		} else {
-			b.dataStorage = make([]byte, 0, maxBlockSize)
+			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
 		}
 	}
 	if cap(b.dst) <= maxSize {
 		b.dst = make([]byte, 0, maxSize+1)
 	}
-	var err error
 	b.data, err = br.readBig(cSize, b.dataStorage)
 	if err != nil {
-		if debug {
+		if debugDecoder {
 			println("Reading block:", err, "(", cSize, ")", len(b.data))
 			printf("%T", br)
 		}
@@ -196,85 +222,14 @@ func (b *blockDec) sendErr(err error) {
 	b.Last = true
 	b.Type = blockTypeReserved
 	b.err = err
-	b.input <- struct{}{}
 }
 
 // Close will release resources.
 // Closed blockDec cannot be reset.
 func (b *blockDec) Close() {
-	close(b.input)
-	close(b.history)
-	close(b.result)
-	b.decWG.Wait()
-}
-
-// decodeAsync will prepare decoding the block when it receives input.
-// This will separate output and history.
-func (b *blockDec) startDecoder() {
-	defer b.decWG.Done()
-	for range b.input {
-		//println("blockDec: Got block input")
-		switch b.Type {
-		case blockTypeRLE:
-			if cap(b.dst) < int(b.RLESize) {
-				if b.lowMem {
-					b.dst = make([]byte, b.RLESize)
-				} else {
-					b.dst = make([]byte, maxBlockSize)
-				}
-			}
-			o := decodeOutput{
-				d:   b,
-				b:   b.dst[:b.RLESize],
-				err: nil,
-			}
-			v := b.data[0]
-			for i := range o.b {
-				o.b[i] = v
-			}
-			hist := <-b.history
-			hist.append(o.b)
-			b.result <- o
-		case blockTypeRaw:
-			o := decodeOutput{
-				d:   b,
-				b:   b.data,
-				err: nil,
-			}
-			hist := <-b.history
-			hist.append(o.b)
-			b.result <- o
-		case blockTypeCompressed:
-			b.dst = b.dst[:0]
-			err := b.decodeCompressed(nil)
-			o := decodeOutput{
-				d:   b,
-				b:   b.dst,
-				err: err,
-			}
-			if debug {
-				println("Decompressed to", len(b.dst), "bytes, error:", err)
-			}
-			b.result <- o
-		case blockTypeReserved:
-			// Used for returning errors.
-			<-b.history
-			b.result <- decodeOutput{
-				d:   b,
-				b:   nil,
-				err: b.err,
-			}
-		default:
-			panic("Invalid block type")
-		}
-		if debug {
-			println("blockDec: Finished block")
-		}
-	}
 }
 
-// decodeAsync will prepare decoding the block when it receives the history.
-// If history is provided, it will not fetch it from the channel.
+// decodeBuf
 func (b *blockDec) decodeBuf(hist *history) error {
 	switch b.Type {
 	case blockTypeRLE:
@@ -297,14 +252,23 @@ func (b *blockDec) decodeBuf(hist *history) error {
 		return nil
 	case blockTypeCompressed:
 		saved := b.dst
-		b.dst = hist.b
-		hist.b = nil
+		// Append directly to history
+		if hist.ignoreBuffer == 0 {
+			b.dst = hist.b
+			hist.b = nil
+		} else {
+			b.dst = b.dst[:0]
+		}
 		err := b.decodeCompressed(hist)
-		if debug {
+		if debugDecoder {
 			println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
 		}
-		hist.b = b.dst
-		b.dst = saved
+		if hist.ignoreBuffer == 0 {
+			hist.b = b.dst
+			b.dst = saved
+		} else {
+			hist.appendKeep(b.dst)
+		}
 		return err
 	case blockTypeReserved:
 		// Used for returning errors.
@@ -314,30 +278,18 @@ func (b *blockDec) decodeBuf(hist *history) error {
 	}
 }
 
-// decodeCompressed will start decompressing a block.
-// If no history is supplied the decoder will decodeAsync as much as possible
-// before fetching from blockDec.history
-func (b *blockDec) decodeCompressed(hist *history) error {
-	in := b.data
-	delayedHistory := hist == nil
-
-	if delayedHistory {
-		// We must always grab history.
-		defer func() {
-			if hist == nil {
-				<-b.history
-			}
-		}()
-	}
+func (b *blockDec) decodeLiterals(in []byte, hist *history) (remain []byte, err error) {
 	// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
 	if len(in) < 2 {
-		return ErrBlockTooSmall
+		return in, ErrBlockTooSmall
 	}
+
 	litType := literalsBlockType(in[0] & 3)
 	var litRegenSize int
 	var litCompSize int
 	sizeFormat := (in[0] >> 2) & 3
 	var fourStreams bool
+	var literals []byte
 	switch litType {
 	case literalsBlockRaw, literalsBlockRLE:
 		switch sizeFormat {
@@ -353,7 +305,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			//  Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
 			if len(in) < 3 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
 			in = in[3:]
@@ -364,7 +316,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
 			if len(in) < 3 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
 			litRegenSize = int(n & 1023)
@@ -375,7 +327,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			fourStreams = true
 			if len(in) < 4 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
 			litRegenSize = int(n & 16383)
@@ -385,7 +337,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			fourStreams = true
 			if len(in) < 5 {
 				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
-				return ErrBlockTooSmall
+				return in, ErrBlockTooSmall
 			}
 			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
 			litRegenSize = int(n & 262143)
@@ -393,16 +345,18 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			in = in[5:]
 		}
 	}
-	if debug {
+	if debugDecoder {
 		println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
 	}
-	var literals []byte
-	var huff *huff0.Scratch
+	if litRegenSize > int(b.WindowSize) || litRegenSize > maxCompressedBlockSize {
+		return in, ErrWindowSizeExceeded
+	}
+
 	switch litType {
 	case literalsBlockRaw:
 		if len(in) < litRegenSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		literals = in[:litRegenSize]
 		in = in[litRegenSize:]
@@ -410,7 +364,7 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	case literalsBlockRLE:
 		if len(in) < 1 {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
@@ -421,7 +375,6 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 					b.literalBuf = make([]byte, litRegenSize)
 				} else {
 					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
-
 				}
 			}
 		}
@@ -431,45 +384,79 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			literals[i] = v
 		}
 		in = in[1:]
-		if debug {
+		if debugDecoder {
 			printf("Found %d RLE compressed literals\n", litRegenSize)
 		}
 	case literalsBlockTreeless:
 		if len(in) < litCompSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		// Store compressed literals, so we defer decoding until we get history.
 		literals = in[:litCompSize]
 		in = in[litCompSize:]
-		if debug {
+		if debugDecoder {
 			printf("Found %d compressed literals\n", litCompSize)
 		}
+		huff := hist.huffTree
+		if huff == nil {
+			return in, errors.New("literal block was treeless, but no history was defined")
+		}
+		// Ensure we have space to store it.
+		if cap(b.literalBuf) < litRegenSize {
+			if b.lowMem {
+				b.literalBuf = make([]byte, 0, litRegenSize)
+			} else {
+				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+			}
+		}
+		var err error
+		// Use our out buffer.
+		huff.MaxDecodedSize = maxCompressedBlockSize
+		if fourStreams {
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
+		} else {
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
+		}
+		// Make sure we don't leak our literals buffer
+		if err != nil {
+			println("decompressing literals:", err)
+			return in, err
+		}
+		if len(literals) != litRegenSize {
+			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+		}
+
 	case literalsBlockCompressed:
 		if len(in) < litCompSize {
 			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
-			return ErrBlockTooSmall
+			return in, ErrBlockTooSmall
 		}
 		literals = in[:litCompSize]
 		in = in[litCompSize:]
-		huff = huffDecoderPool.Get().(*huff0.Scratch)
-		var err error
 		// Ensure we have space to store it.
 		if cap(b.literalBuf) < litRegenSize {
 			if b.lowMem {
 				b.literalBuf = make([]byte, 0, litRegenSize)
 			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+				b.literalBuf = make([]byte, 0, maxCompressedBlockSize)
 			}
 		}
-		if huff == nil {
-			huff = &huff0.Scratch{}
+		huff := hist.huffTree
+		if huff == nil || (hist.dict != nil && huff == hist.dict.litEnc) {
+			huff = huffDecoderPool.Get().(*huff0.Scratch)
+			if huff == nil {
+				huff = &huff0.Scratch{}
+			}
 		}
+		var err error
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
-			return err
+			return in, err
 		}
+		hist.huffTree = huff
+		huff.MaxDecodedSize = maxCompressedBlockSize
 		// Use our out buffer.
 		if fourStreams {
 			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
@@ -478,27 +465,61 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		if err != nil {
 			println("decoding compressed literals:", err)
-			return err
+			return in, err
 		}
 		// Make sure we don't leak our literals buffer
 		if len(literals) != litRegenSize {
-			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+			return in, fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
-		if debug {
+		if debugDecoder {
 			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
 		}
 	}
+	hist.decoders.literals = literals
+	return in, nil
+}
+
+// decodeCompressed will start decompressing a block.
+func (b *blockDec) decodeCompressed(hist *history) error {
+	in := b.data
+	in, err := b.decodeLiterals(in, hist)
+	if err != nil {
+		return err
+	}
+	err = b.prepareSequences(in, hist)
+	if err != nil {
+		return err
+	}
+	if hist.decoders.nSeqs == 0 {
+		b.dst = append(b.dst, hist.decoders.literals...)
+		return nil
+	}
+	before := len(hist.decoders.out)
+	err = hist.decoders.decodeSync(hist.b[hist.ignoreBuffer:])
+	if err != nil {
+		return err
+	}
+	if hist.decoders.maxSyncLen > 0 {
+		hist.decoders.maxSyncLen += uint64(before)
+		hist.decoders.maxSyncLen -= uint64(len(hist.decoders.out))
+	}
+	b.dst = hist.decoders.out
+	hist.recentOffsets = hist.decoders.prevOffset
+	return nil
+}
 
+func (b *blockDec) prepareSequences(in []byte, hist *history) (err error) {
+	if debugDecoder {
+		printf("prepareSequences: %d byte(s) input\n", len(in))
+	}
 	// Decode Sequences
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
 	if len(in) < 1 {
 		return ErrBlockTooSmall
 	}
+	var nSeqs int
 	seqHeader := in[0]
-	nSeqs := 0
 	switch {
-	case seqHeader == 0:
-		in = in[1:]
 	case seqHeader < 128:
 		nSeqs = int(seqHeader)
 		in = in[1:]
@@ -515,19 +536,16 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
 		in = in[3:]
 	}
-	// Allocate sequences
-	if cap(b.sequenceBuf) < nSeqs {
-		if b.lowMem {
-			b.sequenceBuf = make([]seq, nSeqs)
-		} else {
-			// Allocate max
-			b.sequenceBuf = make([]seq, nSeqs, maxSequences)
+	if nSeqs == 0 && len(in) != 0 {
+		// When no sequences, there should not be any more data...
+		if debugDecoder {
+			printf("prepareSequences: 0 sequences, but %d byte(s) left on stream\n", len(in))
 		}
-	} else {
-		// Reuse buffer
-		b.sequenceBuf = b.sequenceBuf[:nSeqs]
+		return ErrUnexpectedBlockSize
 	}
-	var seqs = &sequenceDecs{}
+
+	var seqs = &hist.decoders
+	seqs.nSeqs = nSeqs
 	if nSeqs > 0 {
 		if len(in) < 1 {
 			return ErrBlockTooSmall
@@ -535,12 +553,12 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		br := byteReader{b: in, off: 0}
 		compMode := br.Uint8()
 		br.advance(1)
-		if debug {
+		if debugDecoder {
 			printf("Compression modes: 0b%b", compMode)
 		}
 		for i := uint(0); i < 3; i++ {
 			mode := seqCompMode((compMode >> (6 - i*2)) & 3)
-			if debug {
+			if debugDecoder {
 				println("Table", tableIndex(i), "is", mode)
 			}
 			var seq *sequenceDec
@@ -556,6 +574,9 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 			}
 			switch mode {
 			case compModePredefined:
+				if seq.fse != nil && !seq.fse.preDefined {
+					fseDecoderPool.Put(seq.fse)
+				}
 				seq.fse = &fsePredef[i]
 			case compModeRLE:
 				if br.remain() < 1 {
@@ -563,34 +584,36 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 				}
 				v := br.Uint8()
 				br.advance(1)
-				dec := fseDecoderPool.Get().(*fseDecoder)
+				if seq.fse == nil || seq.fse.preDefined {
+					seq.fse = fseDecoderPool.Get().(*fseDecoder)
+				}
 				symb, err := decSymbolValue(v, symbolTableX[i])
 				if err != nil {
 					printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
 					return err
 				}
-				dec.setRLE(symb)
-				seq.fse = dec
-				if debug {
+				seq.fse.setRLE(symb)
+				if debugDecoder {
 					printf("RLE set to %+v, code: %v", symb, v)
 				}
 			case compModeFSE:
 				println("Reading table for", tableIndex(i))
-				dec := fseDecoderPool.Get().(*fseDecoder)
-				err := dec.readNCount(&br, uint16(maxTableSymbol[i]))
+				if seq.fse == nil || seq.fse.preDefined {
+					seq.fse = fseDecoderPool.Get().(*fseDecoder)
+				}
+				err := seq.fse.readNCount(&br, uint16(maxTableSymbol[i]))
 				if err != nil {
 					println("Read table error:", err)
 					return err
 				}
-				err = dec.transform(symbolTableX[i])
+				err = seq.fse.transform(symbolTableX[i])
 				if err != nil {
 					println("Transform table error:", err)
 					return err
 				}
-				if debug {
-					println("Read table ok", "symbolLen:", dec.symbolLen)
+				if debugDecoder {
+					println("Read table ok", "symbolLen:", seq.fse.symbolLen)
 				}
-				seq.fse = dec
 			case compModeRepeat:
 				seq.repeat = true
 			}
@@ -600,140 +623,106 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 		in = br.unread()
 	}
-
-	// Wait for history.
-	// All time spent after this is critical since it is strictly sequential.
-	if hist == nil {
-		hist = <-b.history
-		if hist.error {
-			return ErrDecoderClosed
-		}
+	if debugDecoder {
+		println("Literals:", len(seqs.literals), "hash:", xxhash.Sum64(seqs.literals), "and", seqs.nSeqs, "sequences.")
 	}
 
-	// Decode treeless literal block.
-	if litType == literalsBlockTreeless {
-		// TODO: We could send the history early WITHOUT the stream history.
-		//   This would allow decoding treeless literials before the byte history is available.
-		//   Silencia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless.
-		//   So not much obvious gain here.
-
-		if hist.huffTree == nil {
-			return errors.New("literal block was treeless, but no history was defined")
-		}
-		// Ensure we have space to store it.
-		if cap(b.literalBuf) < litRegenSize {
-			if b.lowMem {
-				b.literalBuf = make([]byte, 0, litRegenSize)
-			} else {
-				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
-			}
-		}
-		var err error
-		// Use our out buffer.
-		huff = hist.huffTree
-		if fourStreams {
-			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
-		} else {
-			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
-		}
-		// Make sure we don't leak our literals buffer
-		if err != nil {
-			println("decompressing literals:", err)
-			return err
-		}
-		if len(literals) != litRegenSize {
-			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
-		}
-	} else {
-		if hist.huffTree != nil && huff != nil {
-			if hist.dict == nil || hist.dict.litDec != hist.huffTree {
-				huffDecoderPool.Put(hist.huffTree)
-			}
-			hist.huffTree = nil
+	if nSeqs == 0 {
+		if len(b.sequence) > 0 {
+			b.sequence = b.sequence[:0]
 		}
+		return nil
 	}
-	if huff != nil {
-		hist.huffTree = huff
+	br := seqs.br
+	if br == nil {
+		br = &bitReader{}
 	}
-	if debug {
-		println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.")
+	if err := br.init(in); err != nil {
+		return err
 	}
 
-	if nSeqs == 0 {
-		// Decompressed content is defined entirely as Literals Section content.
-		b.dst = append(b.dst, literals...)
-		if delayedHistory {
-			hist.append(literals)
+	if err := seqs.initialize(br, hist, b.dst); err != nil {
+		println("initializing sequences:", err)
+		return err
+	}
+	// Extract blocks...
+	if false && hist.dict == nil {
+		fatalErr := func(err error) {
+			if err != nil {
+				panic(err)
+			}
 		}
-		return nil
+		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
+		var buf bytes.Buffer
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
+		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
+		buf.Write(in)
+		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
 	}
 
-	seqs, err := seqs.mergeHistory(&hist.decoders)
-	if err != nil {
-		return err
-	}
-	if debug {
-		println("History merged ok")
+	return nil
+}
+
+func (b *blockDec) decodeSequences(hist *history) error {
+	if cap(b.sequence) < hist.decoders.nSeqs {
+		if b.lowMem {
+			b.sequence = make([]seqVals, 0, hist.decoders.nSeqs)
+		} else {
+			b.sequence = make([]seqVals, 0, 0x7F00+0xffff)
+		}
 	}
-	br := &bitReader{}
-	if err := br.init(in); err != nil {
-		return err
+	b.sequence = b.sequence[:hist.decoders.nSeqs]
+	if hist.decoders.nSeqs == 0 {
+		hist.decoders.seqSize = len(hist.decoders.literals)
+		return nil
 	}
+	hist.decoders.windowSize = hist.windowSize
+	hist.decoders.prevOffset = hist.recentOffsets
 
-	// TODO: Investigate if sending history without decoders are faster.
-	//   This would allow the sequences to be decoded async and only have to construct stream history.
-	//   If only recent offsets were not transferred, this would be an obvious win.
-	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
+	err := hist.decoders.decode(b.sequence)
+	hist.recentOffsets = hist.decoders.prevOffset
+	return err
+}
 
+func (b *blockDec) executeSequences(hist *history) error {
 	hbytes := hist.b
 	if len(hbytes) > hist.windowSize {
 		hbytes = hbytes[len(hbytes)-hist.windowSize:]
-		// We do not need history any more.
+		// We do not need history anymore.
 		if hist.dict != nil {
 			hist.dict.content = nil
 		}
 	}
-
-	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
-		println("initializing sequences:", err)
-		return err
-	}
-
-	err = seqs.decode(nSeqs, br, hbytes)
+	hist.decoders.windowSize = hist.windowSize
+	hist.decoders.out = b.dst[:0]
+	err := hist.decoders.execute(b.sequence, hbytes)
 	if err != nil {
 		return err
 	}
-	if !br.finished() {
-		return fmt.Errorf("%d extra bits on block, should be 0", br.remain())
-	}
+	return b.updateHistory(hist)
+}
 
-	err = br.close()
-	if err != nil {
-		printf("Closing sequences: %v, %+v\n", err, *br)
-	}
+func (b *blockDec) updateHistory(hist *history) error {
 	if len(b.data) > maxCompressedBlockSize {
 		return fmt.Errorf("compressed block size too large (%d)", len(b.data))
 	}
 	// Set output and release references.
-	b.dst = seqs.out
-	seqs.out, seqs.literals, seqs.hist = nil, nil, nil
+	b.dst = hist.decoders.out
+	hist.recentOffsets = hist.decoders.prevOffset
 
-	if !delayedHistory {
-		// If we don't have delayed history, no need to update.
-		hist.recentOffsets = seqs.prevOffset
-		return nil
-	}
 	if b.Last {
 		// if last block we don't care about history.
 		println("Last block, no history returned")
 		hist.b = hist.b[:0]
 		return nil
+	} else {
+		hist.append(b.dst)
+		if debugDecoder {
+			println("Finished block with ", len(b.sequence), "sequences. Added", len(b.dst), "to history, now length", len(hist.b))
+		}
 	}
-	hist.append(b.dst)
-	hist.recentOffsets = seqs.prevOffset
-	if debug {
-		println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.")
-	}
+	hist.decoders.out, hist.decoders.literals = nil, nil
 
 	return nil
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index c584f6a..12e8f6f 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -14,35 +14,52 @@ import (
 )
 
 type blockEnc struct {
-	size      int
-	literals  []byte
-	sequences []seq
-	coders    seqCoders
-	litEnc    *huff0.Scratch
-	wr        bitWriter
-
-	extraLits int
-	last      bool
-
+	size       int
+	literals   []byte
+	sequences  []seq
+	coders     seqCoders
+	litEnc     *huff0.Scratch
+	dictLitEnc *huff0.Scratch
+	wr         bitWriter
+
+	extraLits         int
 	output            []byte
 	recentOffsets     [3]uint32
 	prevRecentOffsets [3]uint32
+
+	last   bool
+	lowMem bool
 }
 
 // init should be used once the block has been created.
 // If called more than once, the effect is the same as calling reset.
 func (b *blockEnc) init() {
-	if cap(b.literals) < maxCompressedLiteralSize {
-		b.literals = make([]byte, 0, maxCompressedLiteralSize)
-	}
-	const defSeqs = 200
-	b.literals = b.literals[:0]
-	if cap(b.sequences) < defSeqs {
-		b.sequences = make([]seq, 0, defSeqs)
-	}
-	if cap(b.output) < maxCompressedBlockSize {
-		b.output = make([]byte, 0, maxCompressedBlockSize)
+	if b.lowMem {
+		// 1K literals
+		if cap(b.literals) < 1<<10 {
+			b.literals = make([]byte, 0, 1<<10)
+		}
+		const defSeqs = 20
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		// 1K
+		if cap(b.output) < 1<<10 {
+			b.output = make([]byte, 0, 1<<10)
+		}
+	} else {
+		if cap(b.literals) < maxCompressedBlockSize {
+			b.literals = make([]byte, 0, maxCompressedBlockSize)
+		}
+		const defSeqs = 2000
+		if cap(b.sequences) < defSeqs {
+			b.sequences = make([]seq, 0, defSeqs)
+		}
+		if cap(b.output) < maxCompressedBlockSize {
+			b.output = make([]byte, 0, maxCompressedBlockSize)
+		}
 	}
+
 	if b.coders.mlEnc == nil {
 		b.coders.mlEnc = &fseEncoder{}
 		b.coders.mlPrev = &fseEncoder{}
@@ -75,6 +92,7 @@ func (b *blockEnc) reset(prev *blockEnc) {
 	if prev != nil {
 		b.recentOffsets = prev.prevRecentOffsets
 	}
+	b.dictLitEnc = nil
 }
 
 // reset will reset the block for a new encode, but in the same stream,
@@ -138,7 +156,7 @@ func (h *literalsHeader) setSize(regenLen int) {
 	switch {
 	case inBits < 5:
 		lh |= (uint64(regenLen) << 3) | (1 << 60)
-		if debug {
+		if debugEncoder {
 			got := int(lh>>3) & 0xff
 			if got != regenLen {
 				panic(fmt.Sprint("litRegenSize = ", regenLen, "(want) != ", got, "(got)"))
@@ -166,7 +184,7 @@ func (h *literalsHeader) setSizes(compLen, inLen int, single bool) {
 			lh |= 1 << 2
 		}
 		lh |= (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60)
-		if debug {
+		if debugEncoder {
 			const mmask = (1 << 24) - 1
 			n := (lh >> 4) & mmask
 			if int(n&1023) != inLen {
@@ -294,8 +312,8 @@ func (b *blockEnc) encodeRaw(a []byte) {
 	bh.setType(blockTypeRaw)
 	b.output = bh.appendTo(b.output[:0])
 	b.output = append(b.output, a...)
-	if debug {
-		println("Adding RAW block, length", len(a))
+	if debugEncoder {
+		println("Adding RAW block, length", len(a), "last:", b.last)
 	}
 }
 
@@ -307,26 +325,26 @@ func (b *blockEnc) encodeRawTo(dst, src []byte) []byte {
 	bh.setType(blockTypeRaw)
 	dst = bh.appendTo(dst)
 	dst = append(dst, src...)
-	if debug {
-		println("Adding RAW block, length", len(src))
+	if debugEncoder {
+		println("Adding RAW block, length", len(src), "last:", b.last)
 	}
 	return dst
 }
 
 // encodeLits can be used if the block is only litLen.
-func (b *blockEnc) encodeLits(raw bool) error {
+func (b *blockEnc) encodeLits(lits []byte, raw bool) error {
 	var bh blockHeader
 	bh.setLast(b.last)
-	bh.setSize(uint32(len(b.literals)))
+	bh.setSize(uint32(len(lits)))
 
 	// Don't compress extremely small blocks
-	if len(b.literals) < 32 || raw {
-		if debug {
-			println("Adding RAW block, length", len(b.literals))
+	if len(lits) < 8 || (len(lits) < 32 && b.dictLitEnc == nil) || raw {
+		if debugEncoder {
+			println("Adding RAW block, length", len(lits), "last:", b.last)
 		}
 		bh.setType(blockTypeRaw)
 		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, b.literals...)
+		b.output = append(b.output, lits...)
 		return nil
 	}
 
@@ -335,37 +353,42 @@ func (b *blockEnc) encodeLits(raw bool) error {
 		reUsed, single bool
 		err            error
 	)
-	if len(b.literals) >= 1024 {
+	if b.dictLitEnc != nil {
+		b.litEnc.TransferCTable(b.dictLitEnc)
+		b.litEnc.Reuse = huff0.ReusePolicyAllow
+		b.dictLitEnc = nil
+	}
+	if len(lits) >= 1024 {
 		// Use 4 Streams.
-		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
-	} else if len(b.literals) > 32 {
+		out, reUsed, err = huff0.Compress4X(lits, b.litEnc)
+	} else if len(lits) > 32 {
 		// Use 1 stream
 		single = true
-		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
+		out, reUsed, err = huff0.Compress1X(lits, b.litEnc)
 	} else {
 		err = huff0.ErrIncompressible
 	}
 
 	switch err {
 	case huff0.ErrIncompressible:
-		if debug {
-			println("Adding RAW block, length", len(b.literals))
+		if debugEncoder {
+			println("Adding RAW block, length", len(lits), "last:", b.last)
 		}
 		bh.setType(blockTypeRaw)
 		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, b.literals...)
+		b.output = append(b.output, lits...)
 		return nil
 	case huff0.ErrUseRLE:
-		if debug {
-			println("Adding RLE block, length", len(b.literals))
+		if debugEncoder {
+			println("Adding RLE block, length", len(lits))
 		}
 		bh.setType(blockTypeRLE)
 		b.output = bh.appendTo(b.output)
-		b.output = append(b.output, b.literals[0])
+		b.output = append(b.output, lits[0])
 		return nil
+	case nil:
 	default:
 		return err
-	case nil:
 	}
 	// Compressed...
 	// Now, allow reuse
@@ -373,18 +396,18 @@ func (b *blockEnc) encodeLits(raw bool) error {
 	bh.setType(blockTypeCompressed)
 	var lh literalsHeader
 	if reUsed {
-		if debug {
+		if debugEncoder {
 			println("Reused tree, compressed to", len(out))
 		}
 		lh.setType(literalsBlockTreeless)
 	} else {
-		if debug {
+		if debugEncoder {
 			println("New tree, compressed to", len(out), "tree size:", len(b.litEnc.OutTable))
 		}
 		lh.setType(literalsBlockCompressed)
 	}
 	// Set sizes
-	lh.setSizes(len(out), len(b.literals), single)
+	lh.setSizes(len(out), len(lits), single)
 	bh.setSize(uint32(len(out) + lh.size() + 1))
 
 	// Write block headers.
@@ -403,7 +426,7 @@ func fuzzFseEncoder(data []byte) int {
 		return 0
 	}
 	enc := fseEncoder{}
-	hist := enc.Histogram()[:256]
+	hist := enc.Histogram()
 	maxSym := uint8(0)
 	for i, v := range data {
 		v = v & 63
@@ -444,13 +467,19 @@ func fuzzFseEncoder(data []byte) int {
 }
 
 // encode will encode the block and append the output in b.output.
-func (b *blockEnc) encode(raw, rawAllLits bool) error {
+// Previous offset codes must be pushed if more blocks are expected.
+func (b *blockEnc) encode(org []byte, raw, rawAllLits bool) error {
 	if len(b.sequences) == 0 {
-		return b.encodeLits(rawAllLits)
+		return b.encodeLits(b.literals, rawAllLits)
 	}
-	// We want some difference
-	if len(b.literals) > (b.size - (b.size >> 5)) {
-		return errIncompressible
+	// We want some difference to at least account for the headers.
+	saved := b.size - len(b.literals) - (b.size >> 5)
+	if saved < 16 {
+		if org == nil {
+			return errIncompressible
+		}
+		b.popOffsets()
+		return b.encodeLits(org, rawAllLits)
 	}
 
 	var bh blockHeader
@@ -466,6 +495,11 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		reUsed, single bool
 		err            error
 	)
+	if b.dictLitEnc != nil {
+		b.litEnc.TransferCTable(b.dictLitEnc)
+		b.litEnc.Reuse = huff0.ReusePolicyAllow
+		b.dictLitEnc = nil
+	}
 	if len(b.literals) >= 1024 && !raw {
 		// Use 4 Streams.
 		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
@@ -483,7 +517,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		lh.setSize(len(b.literals))
 		b.output = lh.appendTo(b.output)
 		b.output = append(b.output, b.literals...)
-		if debug {
+		if debugEncoder {
 			println("Adding literals RAW, length", len(b.literals))
 		}
 	case huff0.ErrUseRLE:
@@ -491,27 +525,22 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		lh.setSize(len(b.literals))
 		b.output = lh.appendTo(b.output)
 		b.output = append(b.output, b.literals[0])
-		if debug {
+		if debugEncoder {
 			println("Adding literals RLE")
 		}
-	default:
-		if debug {
-			println("Adding literals ERROR:", err)
-		}
-		return err
 	case nil:
 		// Compressed litLen...
 		if reUsed {
-			if debug {
+			if debugEncoder {
 				println("reused tree")
 			}
 			lh.setType(literalsBlockTreeless)
 		} else {
-			if debug {
+			if debugEncoder {
 				println("new tree, size:", len(b.litEnc.OutTable))
 			}
 			lh.setType(literalsBlockCompressed)
-			if debug {
+			if debugEncoder {
 				_, _, err := huff0.ReadTable(out, nil)
 				if err != nil {
 					panic(err)
@@ -519,16 +548,21 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 			}
 		}
 		lh.setSizes(len(out), len(b.literals), single)
-		if debug {
+		if debugEncoder {
 			printf("Compressed %d literals to %d bytes", len(b.literals), len(out))
 			println("Adding literal header:", lh)
 		}
 		b.output = lh.appendTo(b.output)
 		b.output = append(b.output, out...)
 		b.litEnc.Reuse = huff0.ReusePolicyAllow
-		if debug {
+		if debugEncoder {
 			println("Adding literals compressed")
 		}
+	default:
+		if debugEncoder {
+			println("Adding literals ERROR:", err)
+		}
+		return err
 	}
 	// Sequence compression
 
@@ -543,7 +577,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		n := len(b.sequences) - 0x7f00
 		b.output = append(b.output, 255, uint8(n), uint8(n>>8))
 	}
-	if debug {
+	if debugEncoder {
 		println("Encoding", len(b.sequences), "sequences")
 	}
 	b.genCodes()
@@ -577,17 +611,17 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		nSize = nSize + (nSize+2*8*16)>>4
 		switch {
 		case predefSize <= prevSize && predefSize <= nSize || forcePreDef:
-			if debug {
+			if debugEncoder {
 				println("Using predefined", predefSize>>3, "<=", nSize>>3)
 			}
 			return preDef, compModePredefined
 		case prevSize <= nSize:
-			if debug {
+			if debugEncoder {
 				println("Using previous", prevSize>>3, "<=", nSize>>3)
 			}
 			return prev, compModeRepeat
 		default:
-			if debug {
+			if debugEncoder {
 				println("Using new, predef", predefSize>>3, ". previous:", prevSize>>3, ">", nSize>>3, "header max:", cur.maxHeaderSize()>>3, "bytes")
 				println("tl:", cur.actualTableLog, "symbolLen:", cur.symbolLen, "norm:", cur.norm[:cur.symbolLen], "hist", cur.count[:cur.symbolLen])
 			}
@@ -600,7 +634,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 	if llEnc.useRLE {
 		mode |= uint8(compModeRLE) << 6
 		llEnc.setRLE(b.sequences[0].llCode)
-		if debug {
+		if debugEncoder {
 			println("llEnc.useRLE")
 		}
 	} else {
@@ -611,7 +645,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 	if ofEnc.useRLE {
 		mode |= uint8(compModeRLE) << 4
 		ofEnc.setRLE(b.sequences[0].ofCode)
-		if debug {
+		if debugEncoder {
 			println("ofEnc.useRLE")
 		}
 	} else {
@@ -623,7 +657,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 	if mlEnc.useRLE {
 		mode |= uint8(compModeRLE) << 2
 		mlEnc.setRLE(b.sequences[0].mlCode)
-		if debug {
+		if debugEncoder {
 			println("mlEnc.useRLE, code: ", b.sequences[0].mlCode, "value", b.sequences[0].matchLen)
 		}
 	} else {
@@ -632,7 +666,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		mode |= uint8(m) << 2
 	}
 	b.output = append(b.output, mode)
-	if debug {
+	if debugEncoder {
 		printf("Compression modes: 0b%b", mode)
 	}
 	b.output, err = llEnc.writeCount(b.output)
@@ -688,52 +722,53 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 		println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB)
 	}
 	seq--
-	if llEnc.maxBits+mlEnc.maxBits+ofEnc.maxBits <= 32 {
-		// No need to flush (common)
-		for seq >= 0 {
-			s = b.sequences[seq]
-			wr.flush32()
-			llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
-			// tabelog max is 8 for all.
-			of.encode(ofB)
-			ml.encode(mlB)
-			ll.encode(llB)
-			wr.flush32()
-
-			// We checked that all can stay within 32 bits
-			wr.addBits32NC(s.litLen, llB.outBits)
-			wr.addBits32NC(s.matchLen, mlB.outBits)
-			wr.addBits32NC(s.offset, ofB.outBits)
-
-			if debugSequences {
-				println("Encoded seq", seq, s)
-			}
-
-			seq--
+	// Store sequences in reverse...
+	for seq >= 0 {
+		s = b.sequences[seq]
+
+		ofB := ofTT[s.ofCode]
+		wr.flush32() // tablelog max is below 8 for each, so it will fill max 24 bits.
+		//of.encode(ofB)
+		nbBitsOut := (uint32(of.state) + ofB.deltaNbBits) >> 16
+		dstState := int32(of.state>>(nbBitsOut&15)) + int32(ofB.deltaFindState)
+		wr.addBits16NC(of.state, uint8(nbBitsOut))
+		of.state = of.stateTable[dstState]
+
+		// Accumulate extra bits.
+		outBits := ofB.outBits & 31
+		extraBits := uint64(s.offset & bitMask32[outBits])
+		extraBitsN := outBits
+
+		mlB := mlTT[s.mlCode]
+		//ml.encode(mlB)
+		nbBitsOut = (uint32(ml.state) + mlB.deltaNbBits) >> 16
+		dstState = int32(ml.state>>(nbBitsOut&15)) + int32(mlB.deltaFindState)
+		wr.addBits16NC(ml.state, uint8(nbBitsOut))
+		ml.state = ml.stateTable[dstState]
+
+		outBits = mlB.outBits & 31
+		extraBits = extraBits<<outBits | uint64(s.matchLen&bitMask32[outBits])
+		extraBitsN += outBits
+
+		llB := llTT[s.llCode]
+		//ll.encode(llB)
+		nbBitsOut = (uint32(ll.state) + llB.deltaNbBits) >> 16
+		dstState = int32(ll.state>>(nbBitsOut&15)) + int32(llB.deltaFindState)
+		wr.addBits16NC(ll.state, uint8(nbBitsOut))
+		ll.state = ll.stateTable[dstState]
+
+		outBits = llB.outBits & 31
+		extraBits = extraBits<<outBits | uint64(s.litLen&bitMask32[outBits])
+		extraBitsN += outBits
+
+		wr.flush32()
+		wr.addBits64NC(extraBits, extraBitsN)
+
+		if debugSequences {
+			println("Encoded seq", seq, s)
 		}
-	} else {
-		for seq >= 0 {
-			s = b.sequences[seq]
-			wr.flush32()
-			llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
-			// tabelog max is below 8 for each.
-			of.encode(ofB)
-			ml.encode(mlB)
-			ll.encode(llB)
-			wr.flush32()
-
-			// ml+ll = max 32 bits total
-			wr.addBits32NC(s.litLen, llB.outBits)
-			wr.addBits32NC(s.matchLen, mlB.outBits)
-			wr.flush32()
-			wr.addBits32NC(s.offset, ofB.outBits)
-
-			if debugSequences {
-				println("Encoded seq", seq, s)
-			}
 
-			seq--
-		}
+		seq--
 	}
 	ml.flush(mlEnc.actualTableLog)
 	of.flush(ofEnc.actualTableLog)
@@ -752,7 +787,7 @@ func (b *blockEnc) encode(raw, rawAllLits bool) error {
 
 	// Size is output minus block header.
 	bh.setSize(uint32(len(b.output)-bhOffset) - 3)
-	if debug {
+	if debugEncoder {
 		println("Rewriting block header", bh)
 	}
 	_ = bh.appendTo(b.output[bhOffset:bhOffset])
@@ -767,14 +802,13 @@ func (b *blockEnc) genCodes() {
 		// nothing to do
 		return
 	}
-
 	if len(b.sequences) > math.MaxUint16 {
 		panic("can only encode up to 64K sequences")
 	}
 	// No bounds checks after here:
-	llH := b.coders.llEnc.Histogram()[:256]
-	ofH := b.coders.ofEnc.Histogram()[:256]
-	mlH := b.coders.mlEnc.Histogram()[:256]
+	llH := b.coders.llEnc.Histogram()
+	ofH := b.coders.ofEnc.Histogram()
+	mlH := b.coders.mlEnc.Histogram()
 	for i := range llH {
 		llH[i] = 0
 	}
@@ -786,7 +820,8 @@ func (b *blockEnc) genCodes() {
 	}
 
 	var llMax, ofMax, mlMax uint8
-	for i, seq := range b.sequences {
+	for i := range b.sequences {
+		seq := &b.sequences[i]
 		v := llCode(seq.litLen)
 		seq.llCode = v
 		llH[v]++
@@ -810,7 +845,6 @@ func (b *blockEnc) genCodes() {
 				panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen))
 			}
 		}
-		b.sequences[i] = seq
 	}
 	maxCount := func(a []uint32) int {
 		var max uint32
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 658ef78..b80191e 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -12,8 +12,8 @@ import (
 
 type byteBuffer interface {
 	// Read up to 8 bytes.
-	// Returns nil if no more input is available.
-	readSmall(n int) []byte
+	// Returns io.ErrUnexpectedEOF if this cannot be satisfied.
+	readSmall(n int) ([]byte, error)
 
 	// Read >8 bytes.
 	// MAY use the destination slice.
@@ -29,17 +29,17 @@ type byteBuffer interface {
 // in-memory buffer
 type byteBuf []byte
 
-func (b *byteBuf) readSmall(n int) []byte {
+func (b *byteBuf) readSmall(n int) ([]byte, error) {
 	if debugAsserts && n > 8 {
 		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
 	}
 	bb := *b
 	if len(bb) < n {
-		return nil
+		return nil, io.ErrUnexpectedEOF
 	}
 	r := bb[:n]
 	*b = bb[n:]
-	return r
+	return r, nil
 }
 
 func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
@@ -81,19 +81,22 @@ type readerWrapper struct {
 	tmp [8]byte
 }
 
-func (r *readerWrapper) readSmall(n int) []byte {
+func (r *readerWrapper) readSmall(n int) ([]byte, error) {
 	if debugAsserts && n > 8 {
 		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
 	}
 	n2, err := io.ReadFull(r.r, r.tmp[:n])
 	// We only really care about the actual bytes read.
-	if n2 != n {
-		if debug {
+	if err != nil {
+		if err == io.EOF {
+			return nil, io.ErrUnexpectedEOF
+		}
+		if debugDecoder {
 			println("readSmall: got", n2, "want", n, "err", err)
 		}
-		return nil
+		return nil, err
 	}
-	return r.tmp[:n]
+	return r.tmp[:n], nil
 }
 
 func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
@@ -110,6 +113,9 @@ func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
 func (r *readerWrapper) readByte() (byte, error) {
 	n2, err := r.r.Read(r.tmp[:1])
 	if err != nil {
+		if err == io.EOF {
+			err = io.ErrUnexpectedEOF
+		}
 		return 0, err
 	}
 	if n2 != 1 {
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
new file mode 100644
index 0000000..5022e71
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -0,0 +1,230 @@
+// Copyright 2020+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+// HeaderMaxSize is the maximum size of a Frame and Block Header.
+// If less is sent to Header.Decode it *may* still contain enough information.
+const HeaderMaxSize = 14 + 3
+
+// Header contains information about the first frame and block within that.
+type Header struct {
+	// SingleSegment specifies whether the data is to be decompressed into a
+	// single contiguous memory segment.
+	// It implies that WindowSize is invalid and that FrameContentSize is valid.
+	SingleSegment bool
+
+	// WindowSize is the window of data to keep while decoding.
+	// Will only be set if SingleSegment is false.
+	WindowSize uint64
+
+	// Dictionary ID.
+	// If 0, no dictionary.
+	DictionaryID uint32
+
+	// HasFCS specifies whether FrameContentSize has a valid value.
+	HasFCS bool
+
+	// FrameContentSize is the expected uncompressed size of the entire frame.
+	FrameContentSize uint64
+
+	// Skippable will be true if the frame is meant to be skipped.
+	// This implies that FirstBlock.OK is false.
+	Skippable bool
+
+	// SkippableID is the user-specific ID for the skippable frame.
+	// Valid values are between 0 to 15, inclusive.
+	SkippableID int
+
+	// SkippableSize is the length of the user data to skip following
+	// the header.
+	SkippableSize uint32
+
+	// HeaderSize is the raw size of the frame header.
+	//
+	// For normal frames, it includes the size of the magic number and
+	// the size of the header (per section 3.1.1.1).
+	// It does not include the size for any data blocks (section 3.1.1.2) nor
+	// the size for the trailing content checksum.
+	//
+	// For skippable frames, this counts the size of the magic number
+	// along with the size of the size field of the payload.
+	// It does not include the size of the skippable payload itself.
+	// The total frame size is the HeaderSize plus the SkippableSize.
+	HeaderSize int
+
+	// First block information.
+	FirstBlock struct {
+		// OK will be set if first block could be decoded.
+		OK bool
+
+		// Is this the last block of a frame?
+		Last bool
+
+		// Is the data compressed?
+		// If true CompressedSize will be populated.
+		// Unfortunately DecompressedSize cannot be determined
+		// without decoding the blocks.
+		Compressed bool
+
+		// DecompressedSize is the expected decompressed size of the block.
+		// Will be 0 if it cannot be determined.
+		DecompressedSize int
+
+		// CompressedSize of the data in the block.
+		// Does not include the block header.
+		// Will be equal to DecompressedSize if not Compressed.
+		CompressedSize int
+	}
+
+	// If set there is a checksum present for the block content.
+	// The checksum field at the end is always 4 bytes long.
+	HasCheckSum bool
+}
+
+// Decode the header from the beginning of the stream.
+// This will decode the frame header and the first block header if enough bytes are provided.
+// It is recommended to provide at least HeaderMaxSize bytes.
+// If the frame header cannot be read an error will be returned.
+// If there isn't enough input, io.ErrUnexpectedEOF is returned.
+// The FirstBlock.OK will indicate if enough information was available to decode the first block header.
+func (h *Header) Decode(in []byte) error {
+	*h = Header{}
+	if len(in) < 4 {
+		return io.ErrUnexpectedEOF
+	}
+	h.HeaderSize += 4
+	b, in := in[:4], in[4:]
+	if !bytes.Equal(b, frameMagic) {
+		if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
+			return ErrMagicMismatch
+		}
+		if len(in) < 4 {
+			return io.ErrUnexpectedEOF
+		}
+		h.HeaderSize += 4
+		h.Skippable = true
+		h.SkippableID = int(b[0] & 0xf)
+		h.SkippableSize = binary.LittleEndian.Uint32(in)
+		return nil
+	}
+
+	// Read Window_Descriptor
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
+	if len(in) < 1 {
+		return io.ErrUnexpectedEOF
+	}
+	fhd, in := in[0], in[1:]
+	h.HeaderSize++
+	h.SingleSegment = fhd&(1<<5) != 0
+	h.HasCheckSum = fhd&(1<<2) != 0
+	if fhd&(1<<3) != 0 {
+		return errors.New("reserved bit set on frame header")
+	}
+
+	if !h.SingleSegment {
+		if len(in) < 1 {
+			return io.ErrUnexpectedEOF
+		}
+		var wd byte
+		wd, in = in[0], in[1:]
+		h.HeaderSize++
+		windowLog := 10 + (wd >> 3)
+		windowBase := uint64(1) << windowLog
+		windowAdd := (windowBase / 8) * uint64(wd&0x7)
+		h.WindowSize = windowBase + windowAdd
+	}
+
+	// Read Dictionary_ID
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
+	if size := fhd & 3; size != 0 {
+		if size == 3 {
+			size = 4
+		}
+		if len(in) < int(size) {
+			return io.ErrUnexpectedEOF
+		}
+		b, in = in[:size], in[size:]
+		h.HeaderSize += int(size)
+		switch size {
+		case 1:
+			h.DictionaryID = uint32(b[0])
+		case 2:
+			h.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+		case 4:
+			h.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+		}
+	}
+
+	// Read Frame_Content_Size
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size
+	var fcsSize int
+	v := fhd >> 6
+	switch v {
+	case 0:
+		if h.SingleSegment {
+			fcsSize = 1
+		}
+	default:
+		fcsSize = 1 << v
+	}
+
+	if fcsSize > 0 {
+		h.HasFCS = true
+		if len(in) < fcsSize {
+			return io.ErrUnexpectedEOF
+		}
+		b, in = in[:fcsSize], in[fcsSize:]
+		h.HeaderSize += int(fcsSize)
+		switch fcsSize {
+		case 1:
+			h.FrameContentSize = uint64(b[0])
+		case 2:
+			// When FCS_Field_Size is 2, the offset of 256 is added.
+			h.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) + 256
+		case 4:
+			h.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24)
+		case 8:
+			d1 := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
+			h.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
+		}
+	}
+
+	// Frame Header done, we will not fail from now on.
+	if len(in) < 3 {
+		return nil
+	}
+	tmp := in[:3]
+	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
+	h.FirstBlock.Last = bh&1 != 0
+	blockType := blockType((bh >> 1) & 3)
+	// find size.
+	cSize := int(bh >> 3)
+	switch blockType {
+	case blockTypeReserved:
+		return nil
+	case blockTypeRLE:
+		h.FirstBlock.Compressed = true
+		h.FirstBlock.DecompressedSize = cSize
+		h.FirstBlock.CompressedSize = 1
+	case blockTypeCompressed:
+		h.FirstBlock.Compressed = true
+		h.FirstBlock.CompressedSize = cSize
+	case blockTypeRaw:
+		h.FirstBlock.DecompressedSize = cSize
+		h.FirstBlock.CompressedSize = cSize
+	default:
+		panic("Invalid block type")
+	}
+
+	h.FirstBlock.OK = true
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index 66b51bf..36119f3 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -6,9 +6,12 @@ package zstd
 
 import (
 	"bytes"
-	"errors"
+	"context"
+	"encoding/binary"
 	"io"
 	"sync"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 // Decoder provides decoding of zstandard streams.
@@ -23,12 +26,19 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 
-	// Streams ready to be decoded.
-	stream chan decodeStream
-
 	// Current read position used for Reader functionality.
 	current decoderState
 
+	// sync stream decoding
+	syncStream struct {
+		decodedFrame uint64
+		br           readerWrapper
+		enabled      bool
+		inFrame      bool
+	}
+
+	frame *frameDec
+
 	// Custom dictionaries.
 	// Always uses copies.
 	dicts map[uint32]dict
@@ -47,7 +57,10 @@ type decoderState struct {
 	output chan decodeOutput
 
 	// cancel remaining output.
-	cancel chan struct{}
+	cancel context.CancelFunc
+
+	// crc of current frame
+	crc *xxhash.Digest
 
 	flushed bool
 }
@@ -82,9 +95,13 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 			return nil, err
 		}
 	}
-	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	d.current.crc = xxhash.New()
 	d.current.flushed = true
 
+	if r == nil {
+		d.current.err = ErrDecoderNilInput
+	}
+
 	// Transfer option dicts.
 	d.dicts = make(map[uint32]dict, len(d.o.dicts))
 	for _, dc := range d.o.dicts {
@@ -110,9 +127,6 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 // Returns the number of bytes written and any error that occurred.
 // When the stream is done, io.EOF will be returned.
 func (d *Decoder) Read(p []byte) (int, error) {
-	if d.stream == nil {
-		return 0, errors.New("no input has been initialized")
-	}
 	var n int
 	for {
 		if len(d.current.b) > 0 {
@@ -130,12 +144,12 @@ func (d *Decoder) Read(p []byte) (int, error) {
 				break
 			}
 			if !d.nextBlock(n == 0) {
-				return n, nil
+				return n, d.current.err
 			}
 		}
 	}
 	if len(d.current.b) > 0 {
-		if debug {
+		if debugDecoder {
 			println("returning", n, "still bytes left:", len(d.current.b))
 		}
 		// Only return error at end of block
@@ -144,7 +158,7 @@ func (d *Decoder) Read(p []byte) (int, error) {
 	if d.current.err != nil {
 		d.drainOutput()
 	}
-	if debug {
+	if debugDecoder {
 		println("returning", n, d.current.err, len(d.decoders))
 	}
 	return n, d.current.err
@@ -152,28 +166,33 @@ func (d *Decoder) Read(p []byte) (int, error) {
 
 // Reset will reset the decoder the supplied stream after the current has finished processing.
 // Note that this functionality cannot be used after Close has been called.
+// Reset can be called with a nil reader to release references to the previous reader.
+// After being called with a nil reader, no other operations than Reset or DecodeAll or Close
+// should be used.
 func (d *Decoder) Reset(r io.Reader) error {
 	if d.current.err == ErrDecoderClosed {
 		return d.current.err
 	}
-	if r == nil {
-		return errors.New("nil Reader sent as input")
-	}
-
-	if d.stream == nil {
-		d.stream = make(chan decodeStream, 1)
-		d.streamWg.Add(1)
-		go d.startStreamDecoder(d.stream)
-	}
 
 	d.drainOutput()
 
-	// If bytes buffer and < 1MB, do sync decoding anyway.
-	if bb, ok := r.(*bytes.Buffer); ok && bb.Len() < 1<<20 {
-		if debug {
+	d.syncStream.br.r = nil
+	if r == nil {
+		d.current.err = ErrDecoderNilInput
+		if len(d.current.b) > 0 {
+			d.current.b = d.current.b[:0]
+		}
+		d.current.flushed = true
+		return nil
+	}
+
+	// If bytes buffer and < 5MB, do sync decoding anyway.
+	if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
+		bb2 := bb
+		if debugDecoder {
 			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
 		}
-		b := bb.Bytes()
+		b := bb2.Bytes()
 		var dst []byte
 		if cap(d.current.b) > 0 {
 			dst = d.current.b
@@ -186,36 +205,48 @@ func (d *Decoder) Reset(r io.Reader) error {
 		d.current.b = dst
 		d.current.err = err
 		d.current.flushed = true
-		if debug {
-			println("sync decode to ", len(dst), "bytes, err:", err)
+		if debugDecoder {
+			println("sync decode to", len(dst), "bytes, err:", err)
 		}
 		return nil
 	}
-
 	// Remove current block.
+	d.stashDecoder()
 	d.current.decodeOutput = decodeOutput{}
 	d.current.err = nil
-	d.current.cancel = make(chan struct{})
 	d.current.flushed = false
 	d.current.d = nil
 
-	d.stream <- decodeStream{
-		r:      r,
-		output: d.current.output,
-		cancel: d.current.cancel,
+	// Ensure no-one else is still running...
+	d.streamWg.Wait()
+	if d.frame == nil {
+		d.frame = newFrameDec(d.o)
+	}
+
+	if d.o.concurrent == 1 {
+		return d.startSyncDecoder(r)
 	}
+
+	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	ctx, cancel := context.WithCancel(context.Background())
+	d.current.cancel = cancel
+	d.streamWg.Add(1)
+	go d.startStreamDecoder(ctx, r, d.current.output)
+
 	return nil
 }
 
 // drainOutput will drain the output until errEndOfStream is sent.
 func (d *Decoder) drainOutput() {
 	if d.current.cancel != nil {
-		println("cancelling current")
-		close(d.current.cancel)
+		if debugDecoder {
+			println("cancelling current")
+		}
+		d.current.cancel()
 		d.current.cancel = nil
 	}
 	if d.current.d != nil {
-		if debug {
+		if debugDecoder {
 			printf("re-adding current decoder %p, decoders: %d", d.current.d, len(d.decoders))
 		}
 		d.decoders <- d.current.d
@@ -226,39 +257,31 @@ func (d *Decoder) drainOutput() {
 		println("current already flushed")
 		return
 	}
-	for {
-		select {
-		case v := <-d.current.output:
-			if v.d != nil {
-				if debug {
-					printf("re-adding decoder %p", v.d)
-				}
-				d.decoders <- v.d
-			}
-			if v.err == errEndOfStream {
-				println("current flushed")
-				d.current.flushed = true
-				return
+	for v := range d.current.output {
+		if v.d != nil {
+			if debugDecoder {
+				printf("re-adding decoder %p", v.d)
 			}
+			d.decoders <- v.d
 		}
 	}
+	d.current.output = nil
+	d.current.flushed = true
 }
 
 // WriteTo writes data to w until there's no more data to write or when an error occurs.
 // The return value n is the number of bytes written.
 // Any error encountered during the write is also returned.
 func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
-	if d.stream == nil {
-		return 0, errors.New("no input has been initialized")
-	}
 	var n int64
 	for {
 		if len(d.current.b) > 0 {
 			n2, err2 := w.Write(d.current.b)
 			n += int64(n2)
-			if err2 != nil && d.current.err == nil {
+			if err2 != nil && (d.current.err == nil || d.current.err == io.EOF) {
 				d.current.err = err2
-				break
+			} else if n2 != len(d.current.b) {
+				d.current.err = io.ErrShortWrite
 			}
 		}
 		if d.current.err != nil {
@@ -282,7 +305,7 @@ func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
 // DecodeAll can be used concurrently.
 // The Decoder concurrency limits will be respected.
 func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
-	if d.current.err == ErrDecoderClosed {
+	if d.decoders == nil {
 		return dst, ErrDecoderClosed
 	}
 
@@ -290,11 +313,14 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	block := <-d.decoders
 	frame := block.localFrame
 	defer func() {
-		if debug {
+		if debugDecoder {
 			printf("re-adding decoder: %p", block)
 		}
 		frame.rawInput = nil
 		frame.bBuf = nil
+		if frame.history.decoders.br != nil {
+			frame.history.decoders.br.in = nil
+		}
 		d.decoders <- block
 	}()
 	frame.bBuf = input
@@ -302,37 +328,50 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	for {
 		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
-		if err == io.EOF {
-			return dst, nil
+		if err != nil {
+			if err == io.EOF {
+				if debugDecoder {
+					println("frame reset return EOF")
+				}
+				return dst, nil
+			}
+			return dst, err
 		}
 		if frame.DictionaryID != nil {
 			dict, ok := d.dicts[*frame.DictionaryID]
 			if !ok {
 				return nil, ErrUnknownDictionary
 			}
+			if debugDecoder {
+				println("setting dict", frame.DictionaryID)
+			}
 			frame.history.setDict(&dict)
 		}
-		if err != nil {
-			return dst, err
-		}
-		if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
-			return dst, ErrDecoderSizeExceeded
+		if frame.WindowSize > d.o.maxWindowSize {
+			return dst, ErrWindowSizeExceeded
 		}
-		if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
-			// Never preallocate moe than 1 GB up front.
-			if uint64(cap(dst)) < frame.FrameContentSize {
-				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+		if frame.FrameContentSize != fcsUnknown {
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+				return dst, ErrDecoderSizeExceeded
+			}
+			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize)+compressedBlockOverAlloc)
 				copy(dst2, dst)
 				dst = dst2
 			}
 		}
+
 		if cap(dst) == 0 {
-			// Allocate window size * 2 by default if nothing is provided and we didn't get frame content size.
-			size := frame.WindowSize * 2
+			// Allocate len(input) * 2 by default if nothing is provided
+			// and we didn't get frame content size.
+			size := len(input) * 2
 			// Cap to 1 MB.
 			if size > 1<<20 {
 				size = 1 << 20
 			}
+			if uint64(size) > d.o.maxDecodedSize {
+				size = int(d.o.maxDecodedSize)
+			}
 			dst = make([]byte, 0, size)
 		}
 
@@ -341,6 +380,9 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 			return dst, err
 		}
 		if len(frame.bBuf) == 0 {
+			if debugDecoder {
+				println("frame dbuf empty")
+			}
 			break
 		}
 	}
@@ -353,33 +395,176 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 // If non-blocking mode is used the returned boolean will be false
 // if no data was available without blocking.
 func (d *Decoder) nextBlock(blocking bool) (ok bool) {
-	if d.current.d != nil {
-		if debug {
-			printf("re-adding current decoder %p", d.current.d)
-		}
-		d.decoders <- d.current.d
-		d.current.d = nil
-	}
 	if d.current.err != nil {
 		// Keep error state.
-		return blocking
+		return false
+	}
+	d.current.b = d.current.b[:0]
+
+	// SYNC:
+	if d.syncStream.enabled {
+		if !blocking {
+			return false
+		}
+		ok = d.nextBlockSync()
+		if !ok {
+			d.stashDecoder()
+		}
+		return ok
 	}
 
+	//ASYNC:
+	d.stashDecoder()
 	if blocking {
-		d.current.decodeOutput = <-d.current.output
+		d.current.decodeOutput, ok = <-d.current.output
 	} else {
 		select {
-		case d.current.decodeOutput = <-d.current.output:
+		case d.current.decodeOutput, ok = <-d.current.output:
 		default:
 			return false
 		}
 	}
-	if debug {
-		println("got", len(d.current.b), "bytes, error:", d.current.err)
+	if !ok {
+		// This should not happen, so signal error state...
+		d.current.err = io.ErrUnexpectedEOF
+		return false
+	}
+	next := d.current.decodeOutput
+	if next.d != nil && next.d.async.newHist != nil {
+		d.current.crc.Reset()
+	}
+	if debugDecoder {
+		var tmp [4]byte
+		binary.LittleEndian.PutUint32(tmp[:], uint32(xxhash.Sum64(next.b)))
+		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
+	}
+
+	if !d.o.ignoreChecksum && len(next.b) > 0 {
+		n, err := d.current.crc.Write(next.b)
+		if err == nil {
+			if n != len(next.b) {
+				d.current.err = io.ErrShortWrite
+			}
+		}
+	}
+	if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
+		got := d.current.crc.Sum64()
+		var tmp [4]byte
+		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
+		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
+			if debugDecoder {
+				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+			}
+			d.current.err = ErrCRCMismatch
+		} else {
+			if debugDecoder {
+				println("CRC ok", tmp[:])
+			}
+		}
+	}
+
+	return true
+}
+
+func (d *Decoder) nextBlockSync() (ok bool) {
+	if d.current.d == nil {
+		d.current.d = <-d.decoders
+	}
+	for len(d.current.b) == 0 {
+		if !d.syncStream.inFrame {
+			d.frame.history.reset()
+			d.current.err = d.frame.reset(&d.syncStream.br)
+			if d.current.err != nil {
+				return false
+			}
+			if d.frame.DictionaryID != nil {
+				dict, ok := d.dicts[*d.frame.DictionaryID]
+				if !ok {
+					d.current.err = ErrUnknownDictionary
+					return false
+				} else {
+					d.frame.history.setDict(&dict)
+				}
+			}
+			if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
+				d.current.err = ErrDecoderSizeExceeded
+				return false
+			}
+
+			d.syncStream.decodedFrame = 0
+			d.syncStream.inFrame = true
+		}
+		d.current.err = d.frame.next(d.current.d)
+		if d.current.err != nil {
+			return false
+		}
+		d.frame.history.ensureBlock()
+		if debugDecoder {
+			println("History trimmed:", len(d.frame.history.b), "decoded already:", d.syncStream.decodedFrame)
+		}
+		histBefore := len(d.frame.history.b)
+		d.current.err = d.current.d.decodeBuf(&d.frame.history)
+
+		if d.current.err != nil {
+			println("error after:", d.current.err)
+			return false
+		}
+		d.current.b = d.frame.history.b[histBefore:]
+		if debugDecoder {
+			println("history after:", len(d.frame.history.b))
+		}
+
+		// Check frame size (before CRC)
+		d.syncStream.decodedFrame += uint64(len(d.current.b))
+		if d.syncStream.decodedFrame > d.frame.FrameContentSize {
+			if debugDecoder {
+				printf("DecodedFrame (%d) > FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+			}
+			d.current.err = ErrFrameSizeExceeded
+			return false
+		}
+
+		// Check FCS
+		if d.current.d.Last && d.frame.FrameContentSize != fcsUnknown && d.syncStream.decodedFrame != d.frame.FrameContentSize {
+			if debugDecoder {
+				printf("DecodedFrame (%d) != FrameContentSize (%d)\n", d.syncStream.decodedFrame, d.frame.FrameContentSize)
+			}
+			d.current.err = ErrFrameSizeMismatch
+			return false
+		}
+
+		// Update/Check CRC
+		if d.frame.HasCheckSum {
+			if !d.o.ignoreChecksum {
+				d.frame.crc.Write(d.current.b)
+			}
+			if d.current.d.Last {
+				if !d.o.ignoreChecksum {
+					d.current.err = d.frame.checkCRC()
+				} else {
+					d.current.err = d.frame.consumeCRC()
+				}
+				if d.current.err != nil {
+					println("CRC error:", d.current.err)
+					return false
+				}
+			}
+		}
+		d.syncStream.inFrame = !d.current.d.Last
 	}
 	return true
 }
 
+func (d *Decoder) stashDecoder() {
+	if d.current.d != nil {
+		if debugDecoder {
+			printf("re-adding current decoder %p", d.current.d)
+		}
+		d.decoders <- d.current.d
+		d.current.d = nil
+	}
+}
+
 // Close will release all resources.
 // It is NOT possible to reuse the decoder after this.
 func (d *Decoder) Close() {
@@ -387,10 +572,10 @@ func (d *Decoder) Close() {
 		return
 	}
 	d.drainOutput()
-	if d.stream != nil {
-		close(d.stream)
+	if d.current.cancel != nil {
+		d.current.cancel()
 		d.streamWg.Wait()
-		d.stream = nil
+		d.current.cancel = nil
 	}
 	if d.decoders != nil {
 		close(d.decoders)
@@ -441,100 +626,307 @@ type decodeOutput struct {
 	err error
 }
 
-type decodeStream struct {
-	r io.Reader
-
-	// Blocks ready to be written to output.
-	output chan decodeOutput
-
-	// cancel reading from the input
-	cancel chan struct{}
+func (d *Decoder) startSyncDecoder(r io.Reader) error {
+	d.frame.history.reset()
+	d.syncStream.br = readerWrapper{r: r}
+	d.syncStream.inFrame = false
+	d.syncStream.enabled = true
+	d.syncStream.decodedFrame = 0
+	return nil
 }
 
-// errEndOfStream indicates that everything from the stream was read.
-var errEndOfStream = errors.New("end-of-stream")
-
 // Create Decoder:
-// Spawn n block decoders. These accept tasks to decode a block.
-// Create goroutine that handles stream processing, this will send history to decoders as they are available.
-// Decoders update the history as they decode.
-// When a block is returned:
-// 		a) history is sent to the next decoder,
-// 		b) content written to CRC.
-// 		c) return data to WRITER.
-// 		d) wait for next block to return data.
-// Once WRITTEN, the decoders reused by the writer frame decoder for re-use.
-func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
+// ASYNC:
+// Spawn 4 go routines.
+// 0: Read frames and decode blocks.
+// 1: Decode block and literals. Receives hufftree and seqdecs, returns seqdecs and huff tree.
+// 2: Wait for recentOffsets if needed. Decode sequences, send recentOffsets.
+// 3: Wait for stream history, execute sequences, send stream history.
+func (d *Decoder) startStreamDecoder(ctx context.Context, r io.Reader, output chan decodeOutput) {
 	defer d.streamWg.Done()
-	frame := newFrameDec(d.o)
-	for stream := range inStream {
-		if debug {
-			println("got new stream")
+	br := readerWrapper{r: r}
+
+	var seqPrepare = make(chan *blockDec, d.o.concurrent)
+	var seqDecode = make(chan *blockDec, d.o.concurrent)
+	var seqExecute = make(chan *blockDec, d.o.concurrent)
+
+	// Async 1: Prepare blocks...
+	go func() {
+		var hist history
+		var hasErr bool
+		for block := range seqPrepare {
+			if hasErr {
+				if block != nil {
+					seqDecode <- block
+				}
+				continue
+			}
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 1: new history")
+				}
+				hist.reset()
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
+				}
+			}
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqDecode <- block
+				continue
+			}
+
+			remain, err := block.decodeLiterals(block.data, &hist)
+			block.err = err
+			hasErr = block.err != nil
+			if err == nil {
+				block.async.literals = hist.decoders.literals
+				block.async.seqData = remain
+			} else if debugDecoder {
+				println("decodeLiterals error:", err)
+			}
+			seqDecode <- block
 		}
-		br := readerWrapper{r: stream.r}
-	decodeStream:
-		for {
-			frame.history.reset()
-			err := frame.reset(&br)
-			if debug && err != nil {
-				println("Frame decoder returned", err)
+		close(seqDecode)
+	}()
+
+	// Async 2: Decode sequences...
+	go func() {
+		var hist history
+		var hasErr bool
+
+		for block := range seqDecode {
+			if hasErr {
+				if block != nil {
+					seqExecute <- block
+				}
+				continue
 			}
-			if err == nil && frame.DictionaryID != nil {
-				dict, ok := d.dicts[*frame.DictionaryID]
-				if !ok {
-					err = ErrUnknownDictionary
-				} else {
-					frame.history.setDict(&dict)
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 2: new history, recent:", block.async.newHist.recentOffsets)
+				}
+				hist.decoders = block.async.newHist.decoders
+				hist.recentOffsets = block.async.newHist.recentOffsets
+				hist.windowSize = block.async.newHist.windowSize
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
 				}
 			}
-			if err != nil {
-				stream.output <- decodeOutput{
-					err: err,
+			if block.err != nil || block.Type != blockTypeCompressed {
+				hasErr = block.err != nil
+				seqExecute <- block
+				continue
+			}
+
+			hist.decoders.literals = block.async.literals
+			block.err = block.prepareSequences(block.async.seqData, &hist)
+			if debugDecoder && block.err != nil {
+				println("prepareSequences returned:", block.err)
+			}
+			hasErr = block.err != nil
+			if block.err == nil {
+				block.err = block.decodeSequences(&hist)
+				if debugDecoder && block.err != nil {
+					println("decodeSequences returned:", block.err)
 				}
-				break
+				hasErr = block.err != nil
+				//				block.async.sequence = hist.decoders.seq[:hist.decoders.nSeqs]
+				block.async.seqSize = hist.decoders.seqSize
 			}
-			if debug {
-				println("starting frame decoder")
+			seqExecute <- block
+		}
+		close(seqExecute)
+	}()
+
+	var wg sync.WaitGroup
+	wg.Add(1)
+
+	// Async 3: Execute sequences...
+	frameHistCache := d.frame.history.b
+	go func() {
+		var hist history
+		var decodedFrame uint64
+		var fcs uint64
+		var hasErr bool
+		for block := range seqExecute {
+			out := decodeOutput{err: block.err, d: block}
+			if block.err != nil || hasErr {
+				hasErr = true
+				output <- out
+				continue
 			}
+			if block.async.newHist != nil {
+				if debugDecoder {
+					println("Async 3: new history")
+				}
+				hist.windowSize = block.async.newHist.windowSize
+				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
+				if block.async.newHist.dict != nil {
+					hist.setDict(block.async.newHist.dict)
+				}
 
-			// This goroutine will forward history between frames.
-			frame.frameDone.Add(1)
-			frame.initAsync()
+				if cap(hist.b) < hist.allocFrameBuffer {
+					if cap(frameHistCache) >= hist.allocFrameBuffer {
+						hist.b = frameHistCache
+					} else {
+						hist.b = make([]byte, 0, hist.allocFrameBuffer)
+						println("Alloc history sized", hist.allocFrameBuffer)
+					}
+				}
+				hist.b = hist.b[:0]
+				fcs = block.async.fcs
+				decodedFrame = 0
+			}
+			do := decodeOutput{err: block.err, d: block}
+			switch block.Type {
+			case blockTypeRLE:
+				if debugDecoder {
+					println("add rle block length:", block.RLESize)
+				}
 
-			go frame.startDecoder(stream.output)
-		decodeFrame:
-			// Go through all blocks of the frame.
-			for {
-				dec := <-d.decoders
-				select {
-				case <-stream.cancel:
-					if !frame.sendErr(dec, io.EOF) {
-						// To not let the decoder dangle, send it back.
-						stream.output <- decodeOutput{d: dec}
+				if cap(block.dst) < int(block.RLESize) {
+					if block.lowMem {
+						block.dst = make([]byte, block.RLESize)
+					} else {
+						block.dst = make([]byte, maxBlockSize)
 					}
-					break decodeStream
-				default:
 				}
-				err := frame.next(dec)
-				switch err {
-				case io.EOF:
-					// End of current frame, no error
-					println("EOF on next block")
-					break decodeFrame
-				case nil:
-					continue
-				default:
-					println("block decoder returned", err)
-					break decodeStream
+				block.dst = block.dst[:block.RLESize]
+				v := block.data[0]
+				for i := range block.dst {
+					block.dst[i] = v
+				}
+				hist.append(block.dst)
+				do.b = block.dst
+			case blockTypeRaw:
+				if debugDecoder {
+					println("add raw block length:", len(block.data))
+				}
+				hist.append(block.data)
+				do.b = block.data
+			case blockTypeCompressed:
+				if debugDecoder {
+					println("execute with history length:", len(hist.b), "window:", hist.windowSize)
+				}
+				hist.decoders.seqSize = block.async.seqSize
+				hist.decoders.literals = block.async.literals
+				do.err = block.executeSequences(&hist)
+				hasErr = do.err != nil
+				if debugDecoder && hasErr {
+					println("executeSequences returned:", do.err)
+				}
+				do.b = block.dst
+			}
+			if !hasErr {
+				decodedFrame += uint64(len(do.b))
+				if decodedFrame > fcs {
+					println("fcs exceeded", block.Last, fcs, decodedFrame)
+					do.err = ErrFrameSizeExceeded
+					hasErr = true
+				} else if block.Last && fcs != fcsUnknown && decodedFrame != fcs {
+					do.err = ErrFrameSizeMismatch
+					hasErr = true
+				} else {
+					if debugDecoder {
+						println("fcs ok", block.Last, fcs, decodedFrame)
+					}
+				}
+			}
+			output <- do
+		}
+		close(output)
+		frameHistCache = hist.b
+		wg.Done()
+		if debugDecoder {
+			println("decoder goroutines finished")
+		}
+	}()
+
+decodeStream:
+	for {
+		frame := d.frame
+		if debugDecoder {
+			println("New frame...")
+		}
+		var historySent bool
+		frame.history.reset()
+		err := frame.reset(&br)
+		if debugDecoder && err != nil {
+			println("Frame decoder returned", err)
+		}
+		if err == nil && frame.DictionaryID != nil {
+			dict, ok := d.dicts[*frame.DictionaryID]
+			if !ok {
+				err = ErrUnknownDictionary
+			} else {
+				frame.history.setDict(&dict)
+			}
+		}
+		if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
+			err = ErrDecoderSizeExceeded
+		}
+		if err != nil {
+			select {
+			case <-ctx.Done():
+			case dec := <-d.decoders:
+				dec.sendErr(err)
+				seqPrepare <- dec
+			}
+			break decodeStream
+		}
+
+		// Go through all blocks of the frame.
+		for {
+			var dec *blockDec
+			select {
+			case <-ctx.Done():
+				break decodeStream
+			case dec = <-d.decoders:
+				// Once we have a decoder, we MUST return it.
+			}
+			err := frame.next(dec)
+			if !historySent {
+				h := frame.history
+				if debugDecoder {
+					println("Alloc History:", h.allocFrameBuffer)
+				}
+				dec.async.newHist = &h
+				dec.async.fcs = frame.FrameContentSize
+				historySent = true
+			} else {
+				dec.async.newHist = nil
+			}
+			if debugDecoder && err != nil {
+				println("next block returned error:", err)
+			}
+			dec.err = err
+			dec.checkCRC = nil
+			if dec.Last && frame.HasCheckSum && err == nil {
+				crc, err := frame.rawInput.readSmall(4)
+				if err != nil {
+					println("CRC missing?", err)
+					dec.err = err
+				}
+				var tmp [4]byte
+				copy(tmp[:], crc)
+				dec.checkCRC = tmp[:]
+				if debugDecoder {
+					println("found crc to check:", dec.checkCRC)
 				}
 			}
-			// All blocks have started decoding, check if there are more frames.
-			println("waiting for done")
-			frame.frameDone.Wait()
-			println("done waiting...")
+			err = dec.err
+			last := dec.Last
+			seqPrepare <- dec
+			if err != nil {
+				break decodeStream
+			}
+			if last {
+				break
+			}
 		}
-		frame.frameDone.Wait()
-		println("Sending EOS")
-		stream.output <- decodeOutput{err: errEndOfStream}
 	}
+	close(seqPrepare)
+	wg.Wait()
+	d.frame.history.b = frameHistCache
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index 284d384..c70e6fa 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -6,7 +6,6 @@ package zstd
 
 import (
 	"errors"
-	"fmt"
 	"runtime"
 )
 
@@ -18,16 +17,22 @@ type decoderOptions struct {
 	lowMem         bool
 	concurrent     int
 	maxDecodedSize uint64
+	maxWindowSize  uint64
 	dicts          []dict
+	ignoreChecksum bool
 }
 
 func (o *decoderOptions) setDefault() {
 	*o = decoderOptions{
 		// use less ram: true for now, but may change.
-		lowMem:     true,
-		concurrent: runtime.GOMAXPROCS(0),
+		lowMem:        true,
+		concurrent:    runtime.GOMAXPROCS(0),
+		maxWindowSize: MaxWindowSize,
 	}
-	o.maxDecodedSize = 1 << 63
+	if o.concurrent > 4 {
+		o.concurrent = 4
+	}
+	o.maxDecodedSize = 64 << 30
 }
 
 // WithDecoderLowmem will set whether to use a lower amount of memory,
@@ -36,16 +41,25 @@ func WithDecoderLowmem(b bool) DOption {
 	return func(o *decoderOptions) error { o.lowMem = b; return nil }
 }
 
-// WithDecoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
-// The value supplied must be at least 1.
-// By default this will be set to GOMAXPROCS.
+// WithDecoderConcurrency sets the number of created decoders.
+// When decoding block with DecodeAll, this will limit the number
+// of possible concurrently running decodes.
+// When decoding streams, this will limit the number of
+// inflight blocks.
+// When decoding streams and setting maximum to 1,
+// no async decoding will be done.
+// When a value of 0 is provided GOMAXPROCS will be used.
+// By default this will be set to 4 or GOMAXPROCS, whatever is lower.
 func WithDecoderConcurrency(n int) DOption {
 	return func(o *decoderOptions) error {
-		if n <= 0 {
-			return fmt.Errorf("Concurrency must be at least 1")
+		if n < 0 {
+			return errors.New("concurrency must be at least 1")
+		}
+		if n == 0 {
+			o.concurrent = runtime.GOMAXPROCS(0)
+		} else {
+			o.concurrent = n
 		}
-		o.concurrent = n
 		return nil
 	}
 }
@@ -53,15 +67,14 @@ func WithDecoderConcurrency(n int) DOption {
 // WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
 // non-streaming operations or maximum window size for streaming operations.
 // This can be used to control memory usage of potentially hostile content.
-// For streaming operations, the maximum window size is capped at 1<<30 bytes.
-// Maximum and default is 1 << 63 bytes.
+// Maximum is 1 << 63 bytes. Default is 64GiB.
 func WithDecoderMaxMemory(n uint64) DOption {
 	return func(o *decoderOptions) error {
 		if n == 0 {
 			return errors.New("WithDecoderMaxMemory must be at least 1")
 		}
 		if n > 1<<63 {
-			return fmt.Errorf("WithDecoderMaxmemory must be less than 1 << 63")
+			return errors.New("WithDecoderMaxmemory must be less than 1 << 63")
 		}
 		o.maxDecodedSize = n
 		return nil
@@ -82,3 +95,29 @@ func WithDecoderDicts(dicts ...[]byte) DOption {
 		return nil
 	}
 }
+
+// WithDecoderMaxWindow allows to set a maximum window size for decodes.
+// This allows rejecting packets that will cause big memory usage.
+// The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
+// If WithDecoderMaxMemory is set to a lower value, that will be used.
+// Default is 512MB, Maximum is ~3.75 TB as per zstandard spec.
+func WithDecoderMaxWindow(size uint64) DOption {
+	return func(o *decoderOptions) error {
+		if size < MinWindowSize {
+			return errors.New("WithMaxWindowSize must be at least 1KB, 1024 bytes")
+		}
+		if size > (1<<41)+7*(1<<38) {
+			return errors.New("WithMaxWindowSize must be less than (1<<41) + 7*(1<<38) ~ 3.75TB")
+		}
+		o.maxWindowSize = size
+		return nil
+	}
+}
+
+// IgnoreChecksum allows to forcibly ignore checksum checking.
+func IgnoreChecksum(b bool) DOption {
+	return func(o *decoderOptions) error {
+		o.ignoreChecksum = b
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
index 8eb6f6b..a36ae83 100644
--- a/vendor/github.com/klauspost/compress/zstd/dict.go
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -13,14 +13,31 @@ import (
 type dict struct {
 	id uint32
 
-	litDec              *huff0.Scratch
+	litEnc              *huff0.Scratch
 	llDec, ofDec, mlDec sequenceDec
-	offsets             [3]int
-	content             []byte
+	//llEnc, ofEnc, mlEnc []*fseEncoder
+	offsets [3]int
+	content []byte
 }
 
 var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
 
+// ID returns the dictionary id or 0 if d is nil.
+func (d *dict) ID() uint32 {
+	if d == nil {
+		return 0
+	}
+	return d.id
+}
+
+// DictContentSize returns the dictionary content size or 0 if d is nil.
+func (d *dict) DictContentSize() int {
+	if d == nil {
+		return 0
+	}
+	return len(d.content)
+}
+
 // Load a dictionary as described in
 // https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
 func loadDict(b []byte) (*dict, error) {
@@ -43,10 +60,11 @@ func loadDict(b []byte) (*dict, error) {
 
 	// Read literal table
 	var err error
-	d.litDec, b, err = huff0.ReadTable(b[8:], nil)
+	d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
 	if err != nil {
 		return nil, err
 	}
+	d.litEnc.Reuse = huff0.ReusePolicyMust
 
 	br := byteReader{
 		b:   b,
@@ -64,7 +82,7 @@ func loadDict(b []byte) (*dict, error) {
 			println("Transform table error:", err)
 			return err
 		}
-		if debug {
+		if debugDecoder || debugEncoder {
 			println("Read table ok", "symbolLen:", dec.symbolLen)
 		}
 		// Set decoders as predefined so they aren't reused.
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
new file mode 100644
index 0000000..15ae8ee
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -0,0 +1,188 @@
+package zstd
+
+import (
+	"fmt"
+	"math/bits"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
+)
+
+const (
+	dictShardBits = 6
+)
+
+type fastBase struct {
+	// cur is the offset at the start of hist
+	cur int32
+	// maximum offset. Should be at least 2x block size.
+	maxMatchOff int32
+	hist        []byte
+	crc         *xxhash.Digest
+	tmp         [8]byte
+	blk         *blockEnc
+	lastDictID  uint32
+	lowMem      bool
+}
+
+// CRC returns the underlying CRC writer.
+func (e *fastBase) CRC() *xxhash.Digest {
+	return e.crc
+}
+
+// AppendCRC will append the CRC to the destination slice and return it.
+func (e *fastBase) AppendCRC(dst []byte) []byte {
+	crc := e.crc.Sum(e.tmp[:0])
+	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
+	return dst
+}
+
+// WindowSize returns the window size of the encoder,
+// or a window size small enough to contain the input size, if > 0.
+func (e *fastBase) WindowSize(size int64) int32 {
+	if size > 0 && size < int64(e.maxMatchOff) {
+		b := int32(1) << uint(bits.Len(uint(size)))
+		// Keep minimum window.
+		if b < 1024 {
+			b = 1024
+		}
+		return b
+	}
+	return e.maxMatchOff
+}
+
+// Block returns the current block.
+func (e *fastBase) Block() *blockEnc {
+	return e.blk
+}
+
+func (e *fastBase) addBlock(src []byte) int32 {
+	if debugAsserts && e.cur > bufferReset {
+		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
+	}
+	// check if we have space already
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.ensureHist(len(src))
+		} else {
+			if cap(e.hist) < int(e.maxMatchOff+maxCompressedBlockSize) {
+				panic(fmt.Errorf("unexpected buffer cap %d, want at least %d with window %d", cap(e.hist), e.maxMatchOff+maxCompressedBlockSize, e.maxMatchOff))
+			}
+			// Move down
+			offset := int32(len(e.hist)) - e.maxMatchOff
+			copy(e.hist[0:e.maxMatchOff], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:e.maxMatchOff]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// ensureHist will ensure that history can keep at least this many bytes.
+func (e *fastBase) ensureHist(n int) {
+	if cap(e.hist) >= n {
+		return
+	}
+	l := e.maxMatchOff
+	if (e.lowMem && e.maxMatchOff > maxCompressedBlockSize) || e.maxMatchOff <= maxCompressedBlockSize {
+		l += maxCompressedBlockSize
+	} else {
+		l += e.maxMatchOff
+	}
+	// Make it at least 1MB.
+	if l < 1<<20 && !e.lowMem {
+		l = 1 << 20
+	}
+	// Make it at least the requested size.
+	if l < int32(n) {
+		l = int32(n)
+	}
+	e.hist = make([]byte, 0, l)
+}
+
+// useBlock will replace the block with the provided one,
+// but transfer recent offsets from the previous.
+func (e *fastBase) UseBlock(enc *blockEnc) {
+	enc.reset(e.blk)
+	e.blk = enc
+}
+
+func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
+	if debugAsserts {
+		if s < 0 {
+			err := fmt.Sprintf("s (%d) < 0", s)
+			panic(err)
+		}
+		if t < 0 {
+			err := fmt.Sprintf("s (%d) < 0", s)
+			panic(err)
+		}
+		if s-t > e.maxMatchOff {
+			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
+			panic(err)
+		}
+		if len(src)-int(s) > maxCompressedBlockSize {
+			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
+		}
+	}
+	a := src[s:]
+	b := src[t:]
+	b = b[:len(a)]
+	end := int32((len(a) >> 3) << 3)
+	for i := int32(0); i < end; i += 8 {
+		if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
+			return i + int32(bits.TrailingZeros64(diff)>>3)
+		}
+	}
+
+	a = a[end:]
+	b = b[end:]
+	for i := range a {
+		if a[i] != b[i] {
+			return int32(i) + end
+		}
+	}
+	return int32(len(a)) + end
+}
+
+// Reset the encoding table.
+func (e *fastBase) resetBase(d *dict, singleBlock bool) {
+	if e.blk == nil {
+		e.blk = &blockEnc{lowMem: e.lowMem}
+		e.blk.init()
+	} else {
+		e.blk.reset(nil)
+	}
+	e.blk.initNewEncode()
+	if e.crc == nil {
+		e.crc = xxhash.New()
+	} else {
+		e.crc.Reset()
+	}
+	if d != nil {
+		low := e.lowMem
+		if singleBlock {
+			e.lowMem = true
+		}
+		e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
+		e.lowMem = low
+	}
+
+	// We offset current position so everything will be out of reach.
+	// If above reset line, history will be purged.
+	if e.cur < bufferReset {
+		e.cur += e.maxMatchOff + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+	if d != nil {
+		// Set offsets (currently not used)
+		for i, off := range d.offsets {
+			e.blk.recentOffsets[i] = uint32(off)
+			e.blk.prevRecentOffsets[i] = e.blk.recentOffsets[i]
+		}
+		// Transfer litenc.
+		e.blk.dictLitEnc = d.litEnc
+		e.hist = append(e.hist, d.content...)
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
new file mode 100644
index 0000000..96028ec
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@@ -0,0 +1,558 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"bytes"
+	"fmt"
+
+	"github.com/klauspost/compress"
+)
+
+const (
+	bestLongTableBits = 22                     // Bits used in the long match table
+	bestLongTableSize = 1 << bestLongTableBits // Size of the table
+	bestLongLen       = 8                      // Bytes used for table hash
+
+	// Note: Increasing the short table bits or making the hash shorter
+	// can actually lead to compression degradation since it will 'steal' more from the
+	// long match table and match offsets are quite big.
+	// This greatly depends on the type of input.
+	bestShortTableBits = 18                      // Bits used in the short match table
+	bestShortTableSize = 1 << bestShortTableBits // Size of the table
+	bestShortLen       = 4                       // Bytes used for table hash
+
+)
+
+type match struct {
+	offset int32
+	s      int32
+	length int32
+	rep    int32
+	est    int32
+}
+
+const highScore = 25000
+
+// estBits will estimate output bits from predefined tables.
+func (m *match) estBits(bitsPerByte int32) {
+	mlc := mlCode(uint32(m.length - zstdMinMatch))
+	var ofc uint8
+	if m.rep < 0 {
+		ofc = ofCode(uint32(m.s-m.offset) + 3)
+	} else {
+		ofc = ofCode(uint32(m.rep))
+	}
+	// Cost, excluding
+	ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
+
+	// Add cost of match encoding...
+	m.est = int32(ofTT.outBits + mlTT.outBits)
+	m.est += int32(ofTT.deltaNbBits>>16 + mlTT.deltaNbBits>>16)
+	// Subtract savings compared to literal encoding...
+	m.est -= (m.length * bitsPerByte) >> 10
+	if m.est > 0 {
+		// Unlikely gain..
+		m.length = 0
+		m.est = highScore
+	}
+}
+
+// bestFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
+// The long match table contains the previous entry with the same hash,
+// effectively making it a "chain" of length 2.
+// When we find a long match we choose between the two values and select the longest.
+// When we find a short match, after checking the long, we check if we can find a long at n+1
+// and that it is longer (lazy matching).
+type bestFastEncoder struct {
+	fastBase
+	table         [bestShortTableSize]prevEntry
+	longTable     [bestLongTableSize]prevEntry
+	dictTable     []prevEntry
+	dictLongTable []prevEntry
+}
+
+// Encode improves compression...
+func (e *bestFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 4
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = prevEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			v2 := e.table[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.table[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Use this to estimate literal cost.
+	// Scaled by 10 bits.
+	bitsPerByte := int32((compress.ShannonEntropyBits(src) * 1024) / len(src))
+	// Huffman can never go < 1 bit/byte
+	if bitsPerByte < 1024 {
+		bitsPerByte = 1024
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	const kSearchStrength = 10
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+	offset3 := int32(blk.recentOffsets[2])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	_ = addLiterals
+
+	if debugEncoder {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		if debugAsserts && canRepeat && offset1 == 0 {
+			panic("offset0 was 0")
+		}
+
+		bestOf := func(a, b match) match {
+			if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
+				return a
+			}
+			return b
+		}
+		const goodEnough = 100
+
+		nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
+		nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
+		candidateL := e.longTable[nextHashL]
+		candidateS := e.table[nextHashS]
+
+		matchAt := func(offset int32, s int32, first uint32, rep int32) match {
+			if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
+				return match{s: s, est: highScore}
+			}
+			if debugAsserts {
+				if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
+					panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
+				}
+			}
+			m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
+			m.estBits(bitsPerByte)
+			return m
+		}
+
+		best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
+		best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+		best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
+
+		if canRepeat && best.length < goodEnough {
+			cv32 := uint32(cv >> 8)
+			spp := s + 1
+			best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
+			best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
+			best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+			if best.length > 0 {
+				cv32 = uint32(cv >> 24)
+				spp += 2
+				best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
+				best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
+				best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+			}
+		}
+		// Load next and check...
+		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
+		e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
+
+		// Look far ahead, unless we have a really long match already...
+		if best.length < goodEnough {
+			// No match found, move forward on input, no need to check forward...
+			if best.length < 4 {
+				s += 1 + (s-nextEmit)>>(kSearchStrength-1)
+				if s >= sLimit {
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+
+			s++
+			candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
+			cv = load6432(src, s)
+			cv2 := load6432(src, s+1)
+			candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)]
+			candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
+
+			// Short at s+1
+			best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+			// Long at s+1, s+2
+			best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
+			best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
+			best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
+			best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
+			if false {
+				// Short at s+3.
+				// Too often worse...
+				best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
+			}
+			// See if we can find a better match by checking where the current best ends.
+			// Use that offset to see if we can find a better full match.
+			if sAt := best.s + best.length; sAt < sLimit {
+				nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
+				candidateEnd := e.longTable[nextHashL]
+				if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
+					bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
+					if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
+						bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
+					}
+					best = bestEnd
+				}
+			}
+		}
+
+		if debugAsserts {
+			if !bytes.Equal(src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]) {
+				panic(fmt.Sprintf("match mismatch: %v != %v", src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]))
+			}
+		}
+
+		// We have a match, we can store the forward value
+		if best.rep > 0 {
+			s = best.s
+			var seq seq
+			seq.matchLen = uint32(best.length - zstdMinMatch)
+
+			// We might be able to match backwards.
+			// Extend as long as we can.
+			start := best.s
+			// We end the search early, so we don't risk 0 literals
+			// and have to do special offset treatment.
+			startLimit := nextEmit + 1
+
+			tMin := s - e.maxMatchOff
+			if tMin < 0 {
+				tMin = 0
+			}
+			repIndex := best.offset
+			for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+				repIndex--
+				start--
+				seq.matchLen++
+			}
+			addLiterals(&seq, start)
+
+			// rep 0
+			seq.offset = uint32(best.rep)
+			if debugSequences {
+				println("repeat sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Index match start+1 (long) -> s - 1
+			index0 := s
+			s = best.s + best.length
+
+			nextEmit = s
+			if s >= sLimit {
+				if debugEncoder {
+					println("repeat ended", s, best.length)
+
+				}
+				break encodeLoop
+			}
+			// Index skipped...
+			off := index0 + e.cur
+			for index0 < s-1 {
+				cv0 := load6432(src, index0)
+				h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
+				h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
+				e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+				e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
+				off++
+				index0++
+			}
+			switch best.rep {
+			case 2:
+				offset1, offset2 = offset2, offset1
+			case 3:
+				offset1, offset2, offset3 = offset3, offset1, offset2
+			}
+			cv = load6432(src, s)
+			continue
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		s = best.s
+		t := best.offset
+		offset1, offset2, offset3 = s-t, offset1, offset2
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := best.length
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		// every entry
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
+			h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
+			index0++
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
+			nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: e.table[nextHashS].offset}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	blk.recentOffsets[2] = uint32(offset3)
+	if debugEncoder {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// EncodeNoHist will encode a block with no history and no following blocks.
+// Most notable difference is that src will not be copied for history and
+// we do not need to check for max match length.
+func (e *bestFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
+	e.Encode(blk, src)
+}
+
+// Reset will reset and set a dictionary if not nil
+func (e *bestFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d == nil {
+		return
+	}
+	// Init or copy dict table
+	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
+		if len(e.dictTable) != len(e.table) {
+			e.dictTable = make([]prevEntry, len(e.table))
+		}
+		end := int32(len(d.content)) - 8 + e.maxMatchOff
+		for i := e.maxMatchOff; i < end; i += 4 {
+			const hashLog = bestShortTableBits
+
+			cv := load6432(d.content, i-e.maxMatchOff)
+			nextHash := hashLen(cv, hashLog, bestShortLen)      // 0 -> 4
+			nextHash1 := hashLen(cv>>8, hashLog, bestShortLen)  // 1 -> 5
+			nextHash2 := hashLen(cv>>16, hashLog, bestShortLen) // 2 -> 6
+			nextHash3 := hashLen(cv>>24, hashLog, bestShortLen) // 3 -> 7
+			e.dictTable[nextHash] = prevEntry{
+				prev:   e.dictTable[nextHash].offset,
+				offset: i,
+			}
+			e.dictTable[nextHash1] = prevEntry{
+				prev:   e.dictTable[nextHash1].offset,
+				offset: i + 1,
+			}
+			e.dictTable[nextHash2] = prevEntry{
+				prev:   e.dictTable[nextHash2].offset,
+				offset: i + 2,
+			}
+			e.dictTable[nextHash3] = prevEntry{
+				prev:   e.dictTable[nextHash3].offset,
+				offset: i + 3,
+			}
+		}
+		e.lastDictID = d.id
+	}
+
+	// Init or copy dict table
+	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
+		if len(e.dictLongTable) != len(e.longTable) {
+			e.dictLongTable = make([]prevEntry, len(e.longTable))
+		}
+		if len(d.content) >= 8 {
+			cv := load6432(d.content, 0)
+			h := hashLen(cv, bestLongTableBits, bestLongLen)
+			e.dictLongTable[h] = prevEntry{
+				offset: e.maxMatchOff,
+				prev:   e.dictLongTable[h].offset,
+			}
+
+			end := int32(len(d.content)) - 8 + e.maxMatchOff
+			off := 8 // First to read
+			for i := e.maxMatchOff + 1; i < end; i++ {
+				cv = cv>>8 | (uint64(d.content[off]) << 56)
+				h := hashLen(cv, bestLongTableBits, bestLongLen)
+				e.dictLongTable[h] = prevEntry{
+					offset: i,
+					prev:   e.dictLongTable[h].offset,
+				}
+				off++
+			}
+		}
+		e.lastDictID = d.id
+	}
+	// Reset table to initial state
+	copy(e.longTable[:], e.dictLongTable)
+
+	e.cur = e.maxMatchOff
+	// Reset table to initial state
+	copy(e.table[:], e.dictTable)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index c120d90..602c05e 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -9,6 +9,7 @@ import "fmt"
 const (
 	betterLongTableBits = 19                       // Bits used in the long match table
 	betterLongTableSize = 1 << betterLongTableBits // Size of the table
+	betterLongLen       = 8                        // Bytes used for table hash
 
 	// Note: Increasing the short table bits or making the hash shorter
 	// can actually lead to compression degradation since it will 'steal' more from the
@@ -16,6 +17,13 @@ const (
 	// This greatly depends on the type of input.
 	betterShortTableBits = 13                        // Bits used in the short match table
 	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+	betterShortLen       = 5                         // Bytes used for table hash
+
+	betterLongTableShardCnt  = 1 << (betterLongTableBits - dictShardBits)    // Number of shards in the table
+	betterLongTableShardSize = betterLongTableSize / betterLongTableShardCnt // Size of an individual shard
+
+	betterShortTableShardCnt  = 1 << (betterShortTableBits - dictShardBits)     // Number of shards in the table
+	betterShortTableShardSize = betterShortTableSize / betterShortTableShardCnt // Size of an individual shard
 )
 
 type prevEntry struct {
@@ -35,6 +43,15 @@ type betterFastEncoder struct {
 	longTable [betterLongTableSize]prevEntry
 }
 
+type betterFastEncoderDict struct {
+	betterFastEncoder
+	dictTable            []tableEntry
+	dictLongTable        []prevEntry
+	shortTableShardDirty [betterShortTableShardCnt]bool
+	longTableShardDirty  [betterLongTableShardCnt]bool
+	allDirty             bool
+}
+
 // Encode improves compression...
 func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
 	const (
@@ -123,7 +140,7 @@ func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("recent offsets:", blk.recentOffsets)
 	}
 
@@ -139,8 +156,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS := hash5(cv, betterShortTableBits)
-			nextHashL := hash8(cv, betterLongTableBits)
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
+			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -189,7 +206,7 @@ encodeLoop:
 
 					nextEmit = s
 					if s >= sLimit {
-						if debug {
+						if debugEncoder {
 							println("repeat ended", s, lenght)
 
 						}
@@ -199,10 +216,10 @@ encodeLoop:
 					for index0 < s-1 {
 						cv0 := load6432(src, index0)
 						cv1 := cv0 >> 8
-						h0 := hash8(cv0, betterLongTableBits)
+						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 						off := index0 + e.cur
 						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 						index0 += 2
 					}
 					cv = load6432(src, s)
@@ -249,7 +266,7 @@ encodeLoop:
 					s += lenght + repOff2
 					nextEmit = s
 					if s >= sLimit {
-						if debug {
+						if debugEncoder {
 							println("repeat ended", s, lenght)
 
 						}
@@ -260,10 +277,10 @@ encodeLoop:
 					for index0 < s-1 {
 						cv0 := load6432(src, index0)
 						cv1 := cv0 >> 8
-						h0 := hash8(cv0, betterLongTableBits)
+						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 						off := index0 + e.cur
 						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 						index0 += 2
 					}
 					cv = load6432(src, s)
@@ -338,7 +355,7 @@ encodeLoop:
 				// See if we can find a long match at s+1
 				const checkAt = 1
 				cv := load6432(src, s+checkAt)
-				nextHashL = hash8(cv, betterLongTableBits)
+				nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
 				candidateL = e.longTable[nextHashL]
 				coffsetL = candidateL.offset - e.cur
 
@@ -397,8 +414,41 @@ encodeLoop:
 			cv = load6432(src, s)
 		}
 
-		// A 4-byte match has been found. Update recent offsets.
-		// We'll later see if more than 4 bytes.
+		// Try to find a better match by searching for a long match at the end of the current best match
+		if s+matched < sLimit {
+			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
+			cv := load3232(src, s)
+			candidateL := e.longTable[nextHashL]
+			coffsetL := candidateL.offset - e.cur - matched
+			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				// Found a long match, at least 4 bytes.
+				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				if matchedNext > matched {
+					t = coffsetL
+					matched = matchedNext
+					if debugMatches {
+						println("long match at end-of-match")
+					}
+				}
+			}
+
+			// Check prev long...
+			if true {
+				coffsetL = candidateL.prev - e.cur - matched
+				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+					// Found a long match, at least 4 bytes.
+					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					if matchedNext > matched {
+						t = coffsetL
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match at end-of-match")
+						}
+					}
+				}
+			}
+		}
+		// A match has been found. Update recent offsets.
 		offset2 = offset1
 		offset1 = s - t
 
@@ -447,10 +497,10 @@ encodeLoop:
 		for index0 < s-1 {
 			cv0 := load6432(src, index0)
 			cv1 := cv0 >> 8
-			h0 := hash8(cv0, betterLongTableBits)
+			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
 			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
-			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 			index0 += 2
 		}
 
@@ -468,8 +518,8 @@ encodeLoop:
 			}
 
 			// Store this, since we have it.
-			nextHashS := hash5(cv, betterShortTableBits)
-			nextHashL := hash8(cv, betterLongTableBits)
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
+			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
 
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
@@ -505,7 +555,7 @@ encodeLoop:
 	}
 	blk.recentOffsets[0] = uint32(offset1)
 	blk.recentOffsets[1] = uint32(offset2)
-	if debug {
+	if debugEncoder {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 }
@@ -514,5 +564,674 @@ encodeLoop:
 // Most notable difference is that src will not be copied for history and
 // we do not need to check for max match length.
 func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.ensureHist(len(src))
 	e.Encode(blk, src)
 }
+
+// Encode improves compression...
+func (e *betterFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			e.allDirty = true
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.allDirty = true
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debugEncoder {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
+			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debugEncoder {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debugEncoder {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.markLongShardDirty(h0)
+						h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
+						e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						e.markShortShardDirty(h1)
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hashLen(cv, betterLongTableBits, betterLongLen)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				e.markLongShardDirty(nextHashL)
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// Try to find a better match by searching for a long match at the end of the current best match
+		if s+matched < sLimit {
+			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
+			cv := load3232(src, s)
+			candidateL := e.longTable[nextHashL]
+			coffsetL := candidateL.offset - e.cur - matched
+			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				// Found a long match, at least 4 bytes.
+				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				if matchedNext > matched {
+					t = coffsetL
+					matched = matchedNext
+					if debugMatches {
+						println("long match at end-of-match")
+					}
+				}
+			}
+
+			// Check prev long...
+			if true {
+				coffsetL = candidateL.prev - e.cur - matched
+				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+					// Found a long match, at least 4 bytes.
+					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					if matchedNext > matched {
+						t = coffsetL
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match at end-of-match")
+						}
+					}
+				}
+			}
+		}
+		// A match has been found. Update recent offsets.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.markLongShardDirty(h0)
+			h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
+			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			e.markShortShardDirty(h1)
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hashLen(cv, betterShortTableBits, betterShortLen)
+			nextHashL := hashLen(cv, betterLongTableBits, betterLongLen)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShortShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debugEncoder {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *betterFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("betterFastEncoder: Reset with dict")
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *betterFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d == nil {
+		return
+	}
+	// Init or copy dict table
+	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
+		if len(e.dictTable) != len(e.table) {
+			e.dictTable = make([]tableEntry, len(e.table))
+		}
+		end := int32(len(d.content)) - 8 + e.maxMatchOff
+		for i := e.maxMatchOff; i < end; i += 4 {
+			const hashLog = betterShortTableBits
+
+			cv := load6432(d.content, i-e.maxMatchOff)
+			nextHash := hashLen(cv, hashLog, betterShortLen)      // 0 -> 4
+			nextHash1 := hashLen(cv>>8, hashLog, betterShortLen)  // 1 -> 5
+			nextHash2 := hashLen(cv>>16, hashLog, betterShortLen) // 2 -> 6
+			nextHash3 := hashLen(cv>>24, hashLog, betterShortLen) // 3 -> 7
+			e.dictTable[nextHash] = tableEntry{
+				val:    uint32(cv),
+				offset: i,
+			}
+			e.dictTable[nextHash1] = tableEntry{
+				val:    uint32(cv >> 8),
+				offset: i + 1,
+			}
+			e.dictTable[nextHash2] = tableEntry{
+				val:    uint32(cv >> 16),
+				offset: i + 2,
+			}
+			e.dictTable[nextHash3] = tableEntry{
+				val:    uint32(cv >> 24),
+				offset: i + 3,
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
+	}
+
+	// Init or copy dict table
+	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
+		if len(e.dictLongTable) != len(e.longTable) {
+			e.dictLongTable = make([]prevEntry, len(e.longTable))
+		}
+		if len(d.content) >= 8 {
+			cv := load6432(d.content, 0)
+			h := hashLen(cv, betterLongTableBits, betterLongLen)
+			e.dictLongTable[h] = prevEntry{
+				offset: e.maxMatchOff,
+				prev:   e.dictLongTable[h].offset,
+			}
+
+			end := int32(len(d.content)) - 8 + e.maxMatchOff
+			off := 8 // First to read
+			for i := e.maxMatchOff + 1; i < end; i++ {
+				cv = cv>>8 | (uint64(d.content[off]) << 56)
+				h := hashLen(cv, betterLongTableBits, betterLongLen)
+				e.dictLongTable[h] = prevEntry{
+					offset: i,
+					prev:   e.dictLongTable[h].offset,
+				}
+				off++
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
+	}
+
+	// Reset table to initial state
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterShortTableShardCnt
+		const shardSize = betterShortTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.table[:], e.dictTable)
+			for i := range e.shortTableShardDirty {
+				e.shortTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.shortTableShardDirty {
+				if !e.shortTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+				e.shortTableShardDirty[i] = false
+			}
+		}
+	}
+	{
+		dirtyShardCnt := 0
+		if !e.allDirty {
+			for i := range e.shortTableShardDirty {
+				if e.shortTableShardDirty[i] {
+					dirtyShardCnt++
+				}
+			}
+		}
+		const shardCnt = betterLongTableShardCnt
+		const shardSize = betterLongTableShardSize
+		if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+			copy(e.longTable[:], e.dictLongTable)
+			for i := range e.longTableShardDirty {
+				e.longTableShardDirty[i] = false
+			}
+		} else {
+			for i := range e.longTableShardDirty {
+				if !e.longTableShardDirty[i] {
+					continue
+				}
+
+				copy(e.longTable[i*shardSize:(i+1)*shardSize], e.dictLongTable[i*shardSize:(i+1)*shardSize])
+				e.longTableShardDirty[i] = false
+			}
+		}
+	}
+	e.cur = e.maxMatchOff
+	e.allDirty = false
+}
+
+func (e *betterFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/betterLongTableShardSize] = true
+}
+
+func (e *betterFastEncoderDict) markShortShardDirty(entryNum uint32) {
+	e.shortTableShardDirty[entryNum/betterShortTableShardSize] = true
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 50276bc..d6b3104 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -10,10 +10,16 @@ const (
 	dFastLongTableBits = 17                      // Bits used in the long match table
 	dFastLongTableSize = 1 << dFastLongTableBits // Size of the table
 	dFastLongTableMask = dFastLongTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	dFastLongLen       = 8                       // Bytes used for table hash
+
+	dLongTableShardCnt  = 1 << (dFastLongTableBits - dictShardBits) // Number of shards in the table
+	dLongTableShardSize = dFastLongTableSize / tableShardCnt        // Size of an individual shard
 
 	dFastShortTableBits = tableBits                // Bits used in the short match table
 	dFastShortTableSize = 1 << dFastShortTableBits // Size of the table
 	dFastShortTableMask = dFastShortTableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	dFastShortLen       = 5                        // Bytes used for table hash
+
 )
 
 type doubleFastEncoder struct {
@@ -21,6 +27,13 @@ type doubleFastEncoder struct {
 	longTable [dFastLongTableSize]tableEntry
 }
 
+type doubleFastEncoderDict struct {
+	fastEncoderDict
+	longTable           [dFastLongTableSize]tableEntry
+	dictLongTable       []tableEntry
+	longTableShardDirty [dLongTableShardCnt]bool
+}
+
 // Encode mimmics functionality in zstd_dfast.c
 func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
 	const (
@@ -99,7 +112,7 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("recent offsets:", blk.recentOffsets)
 	}
 
@@ -114,8 +127,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHashS := hash5(cv, dFastShortTableBits)
-			nextHashL := hash8(cv, dFastLongTableBits)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
+			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -160,7 +173,7 @@ encodeLoop:
 					s += lenght + repOff
 					nextEmit = s
 					if s >= sLimit {
-						if debug {
+						if debugEncoder {
 							println("repeat ended", s, lenght)
 
 						}
@@ -198,7 +211,7 @@ encodeLoop:
 				// See if we can find a long match at s+1
 				const checkAt = 1
 				cv := load6432(src, s+checkAt)
-				nextHashL = hash8(cv, dFastLongTableBits)
+				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
 				candidateL = e.longTable[nextHashL]
 				coffsetL = s - (candidateL.offset - e.cur) + checkAt
 
@@ -294,16 +307,16 @@ encodeLoop:
 		cv1 := load6432(src, index1)
 		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
-		e.longTable[hash8(cv0, dFastLongTableBits)] = te0
-		e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+		e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
+		e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
 		cv0 >>= 8
 		cv1 >>= 8
 		te0.offset++
 		te1.offset++
 		te0.val = uint32(cv0)
 		te1.val = uint32(cv1)
-		e.table[hash5(cv0, dFastShortTableBits)] = te0
-		e.table[hash5(cv1, dFastShortTableBits)] = te1
+		e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
+		e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
 
 		cv = load6432(src, s)
 
@@ -320,8 +333,8 @@ encodeLoop:
 			}
 
 			// Store this, since we have it.
-			nextHashS := hash5(cv, dFastShortTableBits)
-			nextHashL := hash8(cv, dFastLongTableBits)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
+			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
@@ -358,7 +371,7 @@ encodeLoop:
 	}
 	blk.recentOffsets[0] = uint32(offset1)
 	blk.recentOffsets[1] = uint32(offset2)
-	if debug {
+	if debugEncoder {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 }
@@ -417,7 +430,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("recent offsets:", blk.recentOffsets)
 	}
 
@@ -426,8 +439,8 @@ encodeLoop:
 		var t int32
 		for {
 
-			nextHashS := hash5(cv, dFastShortTableBits)
-			nextHashL := hash8(cv, dFastLongTableBits)
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
+			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 
@@ -473,7 +486,7 @@ encodeLoop:
 					s += length + repOff
 					nextEmit = s
 					if s >= sLimit {
-						if debug {
+						if debugEncoder {
 							println("repeat ended", s, length)
 
 						}
@@ -494,7 +507,7 @@ encodeLoop:
 				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
 				t = candidateL.offset - e.cur
 				if debugAsserts && s <= t {
-					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					panic(fmt.Sprintf("s (%d) <= t (%d). cur: %d", s, t, e.cur))
 				}
 				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
@@ -511,7 +524,7 @@ encodeLoop:
 				// See if we can find a long match at s+1
 				const checkAt = 1
 				cv := load6432(src, s+checkAt)
-				nextHashL = hash8(cv, dFastLongTableBits)
+				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
 				candidateL = e.longTable[nextHashL]
 				coffsetL = s - (candidateL.offset - e.cur) + checkAt
 
@@ -604,16 +617,16 @@ encodeLoop:
 		cv1 := load6432(src, index1)
 		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
 		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
-		e.longTable[hash8(cv0, dFastLongTableBits)] = te0
-		e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+		e.longTable[hashLen(cv0, dFastLongTableBits, dFastLongLen)] = te0
+		e.longTable[hashLen(cv1, dFastLongTableBits, dFastLongLen)] = te1
 		cv0 >>= 8
 		cv1 >>= 8
 		te0.offset++
 		te1.offset++
 		te0.val = uint32(cv0)
 		te1.val = uint32(cv1)
-		e.table[hash5(cv0, dFastShortTableBits)] = te0
-		e.table[hash5(cv1, dFastShortTableBits)] = te1
+		e.table[hashLen(cv0, dFastShortTableBits, dFastShortLen)] = te0
+		e.table[hashLen(cv1, dFastShortTableBits, dFastShortLen)] = te1
 
 		cv = load6432(src, s)
 
@@ -630,8 +643,8 @@ encodeLoop:
 			}
 
 			// Store this, since we have it.
-			nextHashS := hash5(cv1>>8, dFastShortTableBits)
-			nextHashL := hash8(cv, dFastLongTableBits)
+			nextHashS := hashLen(cv1>>8, dFastShortTableBits, dFastShortLen)
+			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
 
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
@@ -667,7 +680,7 @@ encodeLoop:
 		blk.literals = append(blk.literals, src[nextEmit:]...)
 		blk.extraLits = len(src) - int(nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 
@@ -676,3 +689,436 @@ encodeLoop:
 		e.cur += int32(len(src))
 	}
 }
+
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *doubleFastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = tableEntry{}
+			}
+			e.markAllShardsDirty()
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.longTable[i].offset = v
+		}
+		e.markAllShardsDirty()
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debugEncoder {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
+			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += lenght + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debugEncoder {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hashLen(cv, dFastLongTableBits, dFastLongLen)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				e.markLongShardDirty(nextHashL)
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		longHash1 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+		longHash2 := hashLen(cv0, dFastLongTableBits, dFastLongLen)
+		e.longTable[longHash1] = te0
+		e.longTable[longHash2] = te1
+		e.markLongShardDirty(longHash1)
+		e.markLongShardDirty(longHash2)
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		hashVal1 := hashLen(cv0, dFastShortTableBits, dFastShortLen)
+		hashVal2 := hashLen(cv1, dFastShortTableBits, dFastShortLen)
+		e.table[hashVal1] = te0
+		e.markShardDirty(hashVal1)
+		e.table[hashVal2] = te1
+		e.markShardDirty(hashVal2)
+
+		cv = load6432(src, s)
+
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hashLen(cv, dFastShortTableBits, dFastShortLen)
+			nextHashL := hashLen(cv, dFastLongTableBits, dFastLongLen)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.markLongShardDirty(nextHashL)
+			e.table[nextHashS] = entry
+			e.markShardDirty(nextHashS)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debugEncoder {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+	// If we encoded more than 64K mark all dirty.
+	if len(src) > 64<<10 {
+		e.markAllShardsDirty()
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *doubleFastEncoder) Reset(d *dict, singleBlock bool) {
+	e.fastEncoder.Reset(d, singleBlock)
+	if d != nil {
+		panic("doubleFastEncoder: Reset with dict not supported")
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *doubleFastEncoderDict) Reset(d *dict, singleBlock bool) {
+	allDirty := e.allDirty
+	e.fastEncoderDict.Reset(d, singleBlock)
+	if d == nil {
+		return
+	}
+
+	// Init or copy dict table
+	if len(e.dictLongTable) != len(e.longTable) || d.id != e.lastDictID {
+		if len(e.dictLongTable) != len(e.longTable) {
+			e.dictLongTable = make([]tableEntry, len(e.longTable))
+		}
+		if len(d.content) >= 8 {
+			cv := load6432(d.content, 0)
+			e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
+				val:    uint32(cv),
+				offset: e.maxMatchOff,
+			}
+			end := int32(len(d.content)) - 8 + e.maxMatchOff
+			for i := e.maxMatchOff + 1; i < end; i++ {
+				cv = cv>>8 | (uint64(d.content[i-e.maxMatchOff+7]) << 56)
+				e.dictLongTable[hashLen(cv, dFastLongTableBits, dFastLongLen)] = tableEntry{
+					val:    uint32(cv),
+					offset: i,
+				}
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
+	}
+	// Reset table to initial state
+	e.cur = e.maxMatchOff
+
+	dirtyShardCnt := 0
+	if !allDirty {
+		for i := range e.longTableShardDirty {
+			if e.longTableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
+	}
+
+	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
+		copy(e.longTable[:], e.dictLongTable)
+		for i := range e.longTableShardDirty {
+			e.longTableShardDirty[i] = false
+		}
+		return
+	}
+	for i := range e.longTableShardDirty {
+		if !e.longTableShardDirty[i] {
+			continue
+		}
+
+		copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		e.longTableShardDirty[i] = false
+	}
+}
+
+func (e *doubleFastEncoderDict) markLongShardDirty(entryNum uint32) {
+	e.longTableShardDirty[entryNum/dLongTableShardSize] = true
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index 4104b45..f51ab52 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -6,17 +6,16 @@ package zstd
 
 import (
 	"fmt"
-	"math"
-	"math/bits"
-
-	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 const (
-	tableBits      = 15             // Bits used in the table
-	tableSize      = 1 << tableBits // Size of the table
-	tableMask      = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
-	maxMatchLength = 131074
+	tableBits        = 15                               // Bits used in the table
+	tableSize        = 1 << tableBits                   // Size of the table
+	tableShardCnt    = 1 << (tableBits - dictShardBits) // Number of shards in the table
+	tableShardSize   = tableSize / tableShardCnt        // Size of an individual shard
+	tableFastHashLen = 6
+	tableMask        = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
+	maxMatchLength   = 131074
 )
 
 type tableEntry struct {
@@ -24,51 +23,16 @@ type tableEntry struct {
 	offset int32
 }
 
-type fastBase struct {
-	// cur is the offset at the start of hist
-	cur int32
-	// maximum offset. Should be at least 2x block size.
-	maxMatchOff int32
-	hist        []byte
-	crc         *xxhash.Digest
-	tmp         [8]byte
-	blk         *blockEnc
-}
-
 type fastEncoder struct {
 	fastBase
 	table [tableSize]tableEntry
 }
 
-// CRC returns the underlying CRC writer.
-func (e *fastBase) CRC() *xxhash.Digest {
-	return e.crc
-}
-
-// AppendCRC will append the CRC to the destination slice and return it.
-func (e *fastBase) AppendCRC(dst []byte) []byte {
-	crc := e.crc.Sum(e.tmp[:0])
-	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
-	return dst
-}
-
-// WindowSize returns the window size of the encoder,
-// or a window size small enough to contain the input size, if > 0.
-func (e *fastBase) WindowSize(size int) int32 {
-	if size > 0 && size < int(e.maxMatchOff) {
-		b := int32(1) << uint(bits.Len(uint(size)))
-		// Keep minimum window.
-		if b < 1024 {
-			b = 1024
-		}
-		return b
-	}
-	return e.maxMatchOff
-}
-
-// Block returns the current block.
-func (e *fastBase) Block() *blockEnc {
-	return e.blk
+type fastEncoderDict struct {
+	fastEncoder
+	dictTable       []tableEntry
+	tableShardDirty [tableShardCnt]bool
+	allDirty        bool
 }
 
 // Encode mimmics functionality in zstd_fast.c
@@ -121,7 +85,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 8
+	const kSearchStrength = 6
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -138,7 +102,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("recent offsets:", blk.recentOffsets)
 	}
 
@@ -157,8 +121,8 @@ encodeLoop:
 				panic("offset0 was 0")
 			}
 
-			nextHash := hash6(cv, hashLog)
-			nextHash2 := hash6(cv>>8, hashLog)
+			nextHash := hashLen(cv, hashLog, tableFastHashLen)
+			nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
 			candidate := e.table[nextHash]
 			candidate2 := e.table[nextHash2]
 			repIndex := s - offset1 + 2
@@ -170,20 +134,7 @@ encodeLoop:
 				// Consider history as well.
 				var seq seq
 				var length int32
-				// length = 4 + e.matchlen(s+6, repIndex+4, src)
-				{
-					a := src[s+6:]
-					b := src[repIndex+4:]
-					endI := len(a) & (math.MaxInt32 - 7)
-					length = int32(endI) + 4
-					for i := 0; i < endI; i += 8 {
-						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-							break
-						}
-					}
-				}
-
+				length = 4 + e.matchlen(s+6, repIndex+4, src)
 				seq.matchLen = uint32(length - zstdMinMatch)
 
 				// We might be able to match backwards.
@@ -213,7 +164,7 @@ encodeLoop:
 				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
-					if debug {
+					if debugEncoder {
 						println("repeat ended", s, length)
 
 					}
@@ -270,20 +221,7 @@ encodeLoop:
 		}
 
 		// Extend the 4-byte match as long as possible.
-		//l := e.matchlen(s+4, t+4, src) + 4
-		var l int32
-		{
-			a := src[s+4:]
-			b := src[t+4:]
-			endI := len(a) & (math.MaxInt32 - 7)
-			l = int32(endI) + 4
-			for i := 0; i < endI; i += 8 {
-				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-					break
-				}
-			}
-		}
+		l := e.matchlen(s+4, t+4, src) + 4
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -320,23 +258,10 @@ encodeLoop:
 		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			//l := 4 + e.matchlen(s+4, o2+4, src)
-			var l int32
-			{
-				a := src[s+4:]
-				b := src[o2+4:]
-				endI := len(a) & (math.MaxInt32 - 7)
-				l = int32(endI) + 4
-				for i := 0; i < endI; i += 8 {
-					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-						break
-					}
-				}
-			}
+			l := 4 + e.matchlen(s+4, o2+4, src)
 
 			// Store this, since we have it.
-			nextHash := hash6(cv, hashLog)
+			nextHash := hashLen(cv, hashLog, tableFastHashLen)
 			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
 			seq.matchLen = uint32(l) - zstdMinMatch
 			seq.litLen = 0
@@ -365,7 +290,7 @@ encodeLoop:
 	}
 	blk.recentOffsets[0] = uint32(offset1)
 	blk.recentOffsets[1] = uint32(offset2)
-	if debug {
+	if debugEncoder {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 }
@@ -378,7 +303,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 		inputMargin            = 8
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)
-	if debug {
+	if debugEncoder {
 		if len(src) > maxBlockSize {
 			panic("src too big")
 		}
@@ -409,7 +334,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	// TEMPLATE
 	const hashLog = tableBits
 	// seems global, but would be nice to tweak.
-	const kSearchStrength = 8
+	const kSearchStrength = 6
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
@@ -426,7 +351,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("recent offsets:", blk.recentOffsets)
 	}
 
@@ -440,8 +365,8 @@ encodeLoop:
 		// By not using them for the first 3 matches
 
 		for {
-			nextHash := hash6(cv, hashLog)
-			nextHash2 := hash6(cv>>8, hashLog)
+			nextHash := hashLen(cv, hashLog, tableFastHashLen)
+			nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
 			candidate := e.table[nextHash]
 			candidate2 := e.table[nextHash2]
 			repIndex := s - offset1 + 2
@@ -452,21 +377,7 @@ encodeLoop:
 			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				// length := 4 + e.matchlen(s+6, repIndex+4, src)
-				// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
-				var length int32
-				{
-					a := src[s+6:]
-					b := src[repIndex+4:]
-					endI := len(a) & (math.MaxInt32 - 7)
-					length = int32(endI) + 4
-					for i := 0; i < endI; i += 8 {
-						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-							break
-						}
-					}
-				}
+				length := 4 + e.matchlen(s+6, repIndex+4, src)
 
 				seq.matchLen = uint32(length - zstdMinMatch)
 
@@ -497,7 +408,7 @@ encodeLoop:
 				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
-					if debug {
+					if debugEncoder {
 						println("repeat ended", s, length)
 
 					}
@@ -556,21 +467,7 @@ encodeLoop:
 			panic(fmt.Sprintf("t (%d) < 0 ", t))
 		}
 		// Extend the 4-byte match as long as possible.
-		//l := e.matchlenNoHist(s+4, t+4, src) + 4
-		// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
-		var l int32
-		{
-			a := src[s+4:]
-			b := src[t+4:]
-			endI := len(a) & (math.MaxInt32 - 7)
-			l = int32(endI) + 4
-			for i := 0; i < endI; i += 8 {
-				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-					break
-				}
-			}
-		}
+		l := e.matchlen(s+4, t+4, src) + 4
 
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -607,24 +504,10 @@ encodeLoop:
 		if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
-			// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
-			var l int32
-			{
-				a := src[s+4:]
-				b := src[o2+4:]
-				endI := len(a) & (math.MaxInt32 - 7)
-				l = int32(endI) + 4
-				for i := 0; i < endI; i += 8 {
-					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
-						break
-					}
-				}
-			}
+			l := 4 + e.matchlen(s+4, o2+4, src)
 
 			// Store this, since we have it.
-			nextHash := hash6(cv, hashLog)
+			nextHash := hashLen(cv, hashLog, tableFastHashLen)
 			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
 			seq.matchLen = uint32(l) - zstdMinMatch
 			seq.litLen = 0
@@ -651,7 +534,7 @@ encodeLoop:
 		blk.literals = append(blk.literals, src[nextEmit:]...)
 		blk.extraLits = len(src) - int(nextEmit)
 	}
-	if debug {
+	if debugEncoder {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 	// We do not store history, so we must offset e.cur to avoid false matches for next user.
@@ -660,96 +543,356 @@ encodeLoop:
 	}
 }
 
-func (e *fastBase) addBlock(src []byte) int32 {
-	if debugAsserts && e.cur > bufferReset {
-		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
+// Encode will encode the content, with a dictionary if initialized for it.
+func (e *fastEncoderDict) Encode(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if e.allDirty || len(src) > 32<<10 {
+		e.fastEncoder.Encode(blk, src)
+		e.allDirty = true
+		return
 	}
-	// check if we have space already
-	if len(e.hist)+len(src) > cap(e.hist) {
-		if cap(e.hist) == 0 {
-			l := e.maxMatchOff * 2
-			// Make it at least 1MB.
-			if l < 1<<20 {
-				l = 1 << 20
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
 			}
-			e.hist = make([]byte, 0, l)
-		} else {
-			if cap(e.hist) < int(e.maxMatchOff*2) {
-				panic("unexpected buffer size")
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
 			}
-			// Move down
-			offset := int32(len(e.hist)) - e.maxMatchOff
-			copy(e.hist[0:e.maxMatchOff], e.hist[offset:])
-			e.cur += offset
-			e.hist = e.hist[:e.maxMatchOff]
+			e.table[i].offset = v
 		}
+		e.cur = e.maxMatchOff
+		break
 	}
-	s := int32(len(e.hist))
-	e.hist = append(e.hist, src...)
-	return s
-}
 
-// useBlock will replace the block with the provided one,
-// but transfer recent offsets from the previous.
-func (e *fastBase) UseBlock(enc *blockEnc) {
-	enc.reset(e.blk)
-	e.blk = enc
-}
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
 
-func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:], src[t:]))
-}
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
 
-func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
-	if debugAsserts {
-		if s < 0 {
-			err := fmt.Sprintf("s (%d) < 0", s)
-			panic(err)
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 7
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
 		}
-		if t < 0 {
-			err := fmt.Sprintf("s (%d) < 0", s)
-			panic(err)
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debugEncoder {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When existing the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks.
+		// By not using them for the first 3 matches
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHash := hashLen(cv, hashLog, tableFastHashLen)
+			nextHash2 := hashLen(cv>>8, hashLog, tableFastHashLen)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+			e.markShardDirty(nextHash2)
+
+			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				var length int32
+				length = 4 + e.matchlen(s+6, repIndex+4, src)
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debugEncoder {
+						println("repeat ended", s, length)
+
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
 		}
-		if s-t > e.maxMatchOff {
-			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
-			panic(err)
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
 		}
-		if len(src)-int(s) > maxCompressedBlockSize {
-			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			// Store this, since we have it.
+			nextHash := hashLen(cv, hashLog, tableFastHashLen)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.markShardDirty(nextHash)
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
 		}
 	}
 
-	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:], src[t:]))
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debugEncoder {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// ResetDict will reset and set a dictionary if not nil
+func (e *fastEncoder) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d != nil {
+		panic("fastEncoder: Reset with dict")
+	}
 }
 
-// Reset the encoding table.
-func (e *fastBase) Reset(singleBlock bool) {
-	if e.blk == nil {
-		e.blk = &blockEnc{}
-		e.blk.init()
-	} else {
-		e.blk.reset(nil)
+// ResetDict will reset and set a dictionary if not nil
+func (e *fastEncoderDict) Reset(d *dict, singleBlock bool) {
+	e.resetBase(d, singleBlock)
+	if d == nil {
+		return
+	}
+
+	// Init or copy dict table
+	if len(e.dictTable) != len(e.table) || d.id != e.lastDictID {
+		if len(e.dictTable) != len(e.table) {
+			e.dictTable = make([]tableEntry, len(e.table))
+		}
+		if true {
+			end := e.maxMatchOff + int32(len(d.content)) - 8
+			for i := e.maxMatchOff; i < end; i += 3 {
+				const hashLog = tableBits
+
+				cv := load6432(d.content, i-e.maxMatchOff)
+				nextHash := hashLen(cv, hashLog, tableFastHashLen)      // 0 -> 5
+				nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen)  // 1 -> 6
+				nextHash2 := hashLen(cv>>16, hashLog, tableFastHashLen) // 2 -> 7
+				e.dictTable[nextHash] = tableEntry{
+					val:    uint32(cv),
+					offset: i,
+				}
+				e.dictTable[nextHash1] = tableEntry{
+					val:    uint32(cv >> 8),
+					offset: i + 1,
+				}
+				e.dictTable[nextHash2] = tableEntry{
+					val:    uint32(cv >> 16),
+					offset: i + 2,
+				}
+			}
+		}
+		e.lastDictID = d.id
+		e.allDirty = true
 	}
-	e.blk.initNewEncode()
-	if e.crc == nil {
-		e.crc = xxhash.New()
-	} else {
-		e.crc.Reset()
+
+	e.cur = e.maxMatchOff
+	dirtyShardCnt := 0
+	if !e.allDirty {
+		for i := range e.tableShardDirty {
+			if e.tableShardDirty[i] {
+				dirtyShardCnt++
+			}
+		}
 	}
-	if !singleBlock && cap(e.hist) < int(e.maxMatchOff*2) {
-		l := e.maxMatchOff * 2
-		// Make it at least 1MB.
-		if l < 1<<20 {
-			l = 1 << 20
+
+	const shardCnt = tableShardCnt
+	const shardSize = tableShardSize
+	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
+		copy(e.table[:], e.dictTable)
+		for i := range e.tableShardDirty {
+			e.tableShardDirty[i] = false
 		}
-		e.hist = make([]byte, 0, l)
+		e.allDirty = false
+		return
 	}
-	// We offset current position so everything will be out of reach.
-	// If above reset line, history will be purged.
-	if e.cur < bufferReset {
-		e.cur += e.maxMatchOff + int32(len(e.hist))
+	for i := range e.tableShardDirty {
+		if !e.tableShardDirty[i] {
+			continue
+		}
+
+		copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		e.tableShardDirty[i] = false
 	}
-	e.hist = e.hist[:0]
+	e.allDirty = false
+}
+
+func (e *fastEncoderDict) markAllShardsDirty() {
+	e.allDirty = true
+}
+
+func (e *fastEncoderDict) markShardDirty(entryNum uint32) {
+	e.tableShardDirty[entryNum/tableShardSize] = true
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_params.go b/vendor/github.com/klauspost/compress/zstd/enc_params.go
deleted file mode 100644
index d874116..0000000
--- a/vendor/github.com/klauspost/compress/zstd/enc_params.go
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright 2019+ Klaus Post. All rights reserved.
-// License information can be found in the LICENSE file.
-// Based on work by Yann Collet, released under BSD License.
-
-package zstd
-
-/*
-// encParams are not really used, just here for reference.
-type encParams struct {
-	// largest match distance : larger == more compression, more memory needed during decompression
-	windowLog uint8
-
-	// fully searched segment : larger == more compression, slower, more memory (useless for fast)
-	chainLog uint8
-
-	//  dispatch table : larger == faster, more memory
-	hashLog uint8
-
-	// < nb of searches : larger == more compression, slower
-	searchLog uint8
-
-	// < match length searched : larger == faster decompression, sometimes less compression
-	minMatch uint8
-
-	// acceptable match size for optimal parser (only) : larger == more compression, slower
-	targetLength uint32
-
-	// see ZSTD_strategy definition above
-	strategy strategy
-}
-
-// strategy defines the algorithm to use when generating sequences.
-type strategy uint8
-
-const (
-	// Compression strategies, listed from fastest to strongest
-	strategyFast strategy = iota + 1
-	strategyDfast
-	strategyGreedy
-	strategyLazy
-	strategyLazy2
-	strategyBtlazy2
-	strategyBtopt
-	strategyBtultra
-	strategyBtultra2
-	// note : new strategies _might_ be added in the future.
-	//   Only the order (from fast to strong) is guaranteed
-
-)
-
-var defEncParams = [4][]encParams{
-	{ // "default" - for any srcSize > 256 KB
-		// W,  C,  H,  S,  L, TL, strat
-		{19, 12, 13, 1, 6, 1, strategyFast},       // base for negative levels
-		{19, 13, 14, 1, 7, 0, strategyFast},       // level  1
-		{20, 15, 16, 1, 6, 0, strategyFast},       // level  2
-		{21, 16, 17, 1, 5, 1, strategyDfast},      // level  3
-		{21, 18, 18, 1, 5, 1, strategyDfast},      // level  4
-		{21, 18, 19, 2, 5, 2, strategyGreedy},     // level  5
-		{21, 19, 19, 3, 5, 4, strategyGreedy},     // level  6
-		{21, 19, 19, 3, 5, 8, strategyLazy},       // level  7
-		{21, 19, 19, 3, 5, 16, strategyLazy2},     // level  8
-		{21, 19, 20, 4, 5, 16, strategyLazy2},     // level  9
-		{22, 20, 21, 4, 5, 16, strategyLazy2},     // level 10
-		{22, 21, 22, 4, 5, 16, strategyLazy2},     // level 11
-		{22, 21, 22, 5, 5, 16, strategyLazy2},     // level 12
-		{22, 21, 22, 5, 5, 32, strategyBtlazy2},   // level 13
-		{22, 22, 23, 5, 5, 32, strategyBtlazy2},   // level 14
-		{22, 23, 23, 6, 5, 32, strategyBtlazy2},   // level 15
-		{22, 22, 22, 5, 5, 48, strategyBtopt},     // level 16
-		{23, 23, 22, 5, 4, 64, strategyBtopt},     // level 17
-		{23, 23, 22, 6, 3, 64, strategyBtultra},   // level 18
-		{23, 24, 22, 7, 3, 256, strategyBtultra2}, // level 19
-		{25, 25, 23, 7, 3, 256, strategyBtultra2}, // level 20
-		{26, 26, 24, 7, 3, 512, strategyBtultra2}, // level 21
-		{27, 27, 25, 9, 3, 999, strategyBtultra2}, // level 22
-	},
-	{ // for srcSize <= 256 KB
-		// W,  C,  H,  S,  L,  T, strat
-		{18, 12, 13, 1, 5, 1, strategyFast},        // base for negative levels
-		{18, 13, 14, 1, 6, 0, strategyFast},        // level  1
-		{18, 14, 14, 1, 5, 1, strategyDfast},       // level  2
-		{18, 16, 16, 1, 4, 1, strategyDfast},       // level  3
-		{18, 16, 17, 2, 5, 2, strategyGreedy},      // level  4.
-		{18, 18, 18, 3, 5, 2, strategyGreedy},      // level  5.
-		{18, 18, 19, 3, 5, 4, strategyLazy},        // level  6.
-		{18, 18, 19, 4, 4, 4, strategyLazy},        // level  7
-		{18, 18, 19, 4, 4, 8, strategyLazy2},       // level  8
-		{18, 18, 19, 5, 4, 8, strategyLazy2},       // level  9
-		{18, 18, 19, 6, 4, 8, strategyLazy2},       // level 10
-		{18, 18, 19, 5, 4, 12, strategyBtlazy2},    // level 11.
-		{18, 19, 19, 7, 4, 12, strategyBtlazy2},    // level 12.
-		{18, 18, 19, 4, 4, 16, strategyBtopt},      // level 13
-		{18, 18, 19, 4, 3, 32, strategyBtopt},      // level 14.
-		{18, 18, 19, 6, 3, 128, strategyBtopt},     // level 15.
-		{18, 19, 19, 6, 3, 128, strategyBtultra},   // level 16.
-		{18, 19, 19, 8, 3, 256, strategyBtultra},   // level 17.
-		{18, 19, 19, 6, 3, 128, strategyBtultra2},  // level 18.
-		{18, 19, 19, 8, 3, 256, strategyBtultra2},  // level 19.
-		{18, 19, 19, 10, 3, 512, strategyBtultra2}, // level 20.
-		{18, 19, 19, 12, 3, 512, strategyBtultra2}, // level 21.
-		{18, 19, 19, 13, 3, 999, strategyBtultra2}, // level 22.
-	},
-	{ // for srcSize <= 128 KB
-		// W,  C,  H,  S,  L,  T, strat
-		{17, 12, 12, 1, 5, 1, strategyFast},        // base for negative levels
-		{17, 12, 13, 1, 6, 0, strategyFast},        // level  1
-		{17, 13, 15, 1, 5, 0, strategyFast},        // level  2
-		{17, 15, 16, 2, 5, 1, strategyDfast},       // level  3
-		{17, 17, 17, 2, 4, 1, strategyDfast},       // level  4
-		{17, 16, 17, 3, 4, 2, strategyGreedy},      // level  5
-		{17, 17, 17, 3, 4, 4, strategyLazy},        // level  6
-		{17, 17, 17, 3, 4, 8, strategyLazy2},       // level  7
-		{17, 17, 17, 4, 4, 8, strategyLazy2},       // level  8
-		{17, 17, 17, 5, 4, 8, strategyLazy2},       // level  9
-		{17, 17, 17, 6, 4, 8, strategyLazy2},       // level 10
-		{17, 17, 17, 5, 4, 8, strategyBtlazy2},     // level 11
-		{17, 18, 17, 7, 4, 12, strategyBtlazy2},    // level 12
-		{17, 18, 17, 3, 4, 12, strategyBtopt},      // level 13.
-		{17, 18, 17, 4, 3, 32, strategyBtopt},      // level 14.
-		{17, 18, 17, 6, 3, 256, strategyBtopt},     // level 15.
-		{17, 18, 17, 6, 3, 128, strategyBtultra},   // level 16.
-		{17, 18, 17, 8, 3, 256, strategyBtultra},   // level 17.
-		{17, 18, 17, 10, 3, 512, strategyBtultra},  // level 18.
-		{17, 18, 17, 5, 3, 256, strategyBtultra2},  // level 19.
-		{17, 18, 17, 7, 3, 512, strategyBtultra2},  // level 20.
-		{17, 18, 17, 9, 3, 512, strategyBtultra2},  // level 21.
-		{17, 18, 17, 11, 3, 999, strategyBtultra2}, // level 22.
-	},
-	{ // for srcSize <= 16 KB
-		// W,  C,  H,  S,  L,  T, strat
-		{14, 12, 13, 1, 5, 1, strategyFast},        // base for negative levels
-		{14, 14, 15, 1, 5, 0, strategyFast},        // level  1
-		{14, 14, 15, 1, 4, 0, strategyFast},        // level  2
-		{14, 14, 15, 2, 4, 1, strategyDfast},       // level  3
-		{14, 14, 14, 4, 4, 2, strategyGreedy},      // level  4
-		{14, 14, 14, 3, 4, 4, strategyLazy},        // level  5.
-		{14, 14, 14, 4, 4, 8, strategyLazy2},       // level  6
-		{14, 14, 14, 6, 4, 8, strategyLazy2},       // level  7
-		{14, 14, 14, 8, 4, 8, strategyLazy2},       // level  8.
-		{14, 15, 14, 5, 4, 8, strategyBtlazy2},     // level  9.
-		{14, 15, 14, 9, 4, 8, strategyBtlazy2},     // level 10.
-		{14, 15, 14, 3, 4, 12, strategyBtopt},      // level 11.
-		{14, 15, 14, 4, 3, 24, strategyBtopt},      // level 12.
-		{14, 15, 14, 5, 3, 32, strategyBtultra},    // level 13.
-		{14, 15, 15, 6, 3, 64, strategyBtultra},    // level 14.
-		{14, 15, 15, 7, 3, 256, strategyBtultra},   // level 15.
-		{14, 15, 15, 5, 3, 48, strategyBtultra2},   // level 16.
-		{14, 15, 15, 6, 3, 128, strategyBtultra2},  // level 17.
-		{14, 15, 15, 7, 3, 256, strategyBtultra2},  // level 18.
-		{14, 15, 15, 8, 3, 256, strategyBtultra2},  // level 19.
-		{14, 15, 15, 8, 3, 512, strategyBtultra2},  // level 20.
-		{14, 15, 15, 9, 3, 512, strategyBtultra2},  // level 21.
-		{14, 15, 15, 10, 3, 999, strategyBtultra2}, // level 22.
-	},
-}
-*/
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index c56d224..dcc987a 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -33,9 +33,9 @@ type encoder interface {
 	Block() *blockEnc
 	CRC() *xxhash.Digest
 	AppendCRC([]byte) []byte
-	WindowSize(size int) int32
+	WindowSize(size int64) int32
 	UseBlock(*blockEnc)
-	Reset(singleBlock bool)
+	Reset(d *dict, singleBlock bool)
 }
 
 type encoderState struct {
@@ -48,6 +48,8 @@ type encoderState struct {
 	err              error
 	writeErr         error
 	nWritten         int64
+	nInput           int64
+	frameContentSize int64
 	headerWritten    bool
 	eofWritten       bool
 	fullFrameWritten bool
@@ -83,8 +85,6 @@ func (e *Encoder) initialize() {
 	e.encoders = make(chan encoder, e.o.concurrent)
 	for i := 0; i < e.o.concurrent; i++ {
 		enc := e.o.encoder()
-		// If not single block, history will be allocated on first use.
-		enc.Reset(true)
 		e.encoders <- enc
 	}
 }
@@ -98,31 +98,47 @@ func (e *Encoder) Reset(w io.Writer) {
 	if cap(s.filling) == 0 {
 		s.filling = make([]byte, 0, e.o.blockSize)
 	}
-	if cap(s.current) == 0 {
-		s.current = make([]byte, 0, e.o.blockSize)
-	}
-	if cap(s.previous) == 0 {
-		s.previous = make([]byte, 0, e.o.blockSize)
+	if e.o.concurrent > 1 {
+		if cap(s.current) == 0 {
+			s.current = make([]byte, 0, e.o.blockSize)
+		}
+		if cap(s.previous) == 0 {
+			s.previous = make([]byte, 0, e.o.blockSize)
+		}
+		s.current = s.current[:0]
+		s.previous = s.previous[:0]
+		if s.writing == nil {
+			s.writing = &blockEnc{lowMem: e.o.lowMem}
+			s.writing.init()
+		}
+		s.writing.initNewEncode()
 	}
 	if s.encoder == nil {
 		s.encoder = e.o.encoder()
 	}
-	if s.writing == nil {
-		s.writing = &blockEnc{}
-		s.writing.init()
-	}
-	s.writing.initNewEncode()
 	s.filling = s.filling[:0]
-	s.current = s.current[:0]
-	s.previous = s.previous[:0]
-	s.encoder.Reset(false)
+	s.encoder.Reset(e.o.dict, false)
 	s.headerWritten = false
 	s.eofWritten = false
 	s.fullFrameWritten = false
 	s.w = w
 	s.err = nil
 	s.nWritten = 0
+	s.nInput = 0
 	s.writeErr = nil
+	s.frameContentSize = 0
+}
+
+// ResetContentSize will reset and set a content size for the next stream.
+// If the bytes written does not match the size given an error will be returned
+// when calling Close().
+// This is removed when Reset is called.
+// Sizes <= 0 results in no content size set.
+func (e *Encoder) ResetContentSize(w io.Writer, size int64) {
+	e.Reset(w)
+	if size >= 0 {
+		e.state.frameContentSize = size
+	}
 }
 
 // Write data to the encoder.
@@ -178,6 +194,12 @@ func (e *Encoder) nextBlock(final bool) error {
 	}
 	if !s.headerWritten {
 		// If we have a single block encode, do a sync compression.
+		if final && len(s.filling) == 0 && !e.o.fullZero {
+			s.headerWritten = true
+			s.fullFrameWritten = true
+			s.eofWritten = true
+			return nil
+		}
 		if final && len(s.filling) > 0 {
 			s.current = e.EncodeAll(s.filling, s.current[:0])
 			var n2 int
@@ -186,21 +208,24 @@ func (e *Encoder) nextBlock(final bool) error {
 				return s.err
 			}
 			s.nWritten += int64(n2)
+			s.nInput += int64(len(s.filling))
 			s.current = s.current[:0]
 			s.filling = s.filling[:0]
 			s.headerWritten = true
 			s.fullFrameWritten = true
+			s.eofWritten = true
 			return nil
 		}
 
 		var tmp [maxHeaderSize]byte
 		fh := frameHeader{
-			ContentSize:   0,
-			WindowSize:    uint32(s.encoder.WindowSize(0)),
+			ContentSize:   uint64(s.frameContentSize),
+			WindowSize:    uint32(s.encoder.WindowSize(s.frameContentSize)),
 			SingleSegment: false,
 			Checksum:      e.o.crc,
-			DictID:        0,
+			DictID:        e.o.dict.ID(),
 		}
+
 		dst, err := fh.appendTo(tmp[:0])
 		if err != nil {
 			return err
@@ -235,11 +260,52 @@ func (e *Encoder) nextBlock(final bool) error {
 		return s.err
 	}
 
+	// SYNC:
+	if e.o.concurrent == 1 {
+		src := s.filling
+		s.nInput += int64(len(s.filling))
+		if debugEncoder {
+			println("Adding sync block,", len(src), "bytes, final:", final)
+		}
+		enc := s.encoder
+		blk := enc.Block()
+		blk.reset(nil)
+		enc.Encode(blk, src)
+		blk.last = final
+		if final {
+			s.eofWritten = true
+		}
+
+		err := errIncompressible
+		// If we got the exact same number of literals as input,
+		// assume the literals cannot be compressed.
+		if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
+			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+		}
+		switch err {
+		case errIncompressible:
+			if debugEncoder {
+				println("Storing incompressible block as raw")
+			}
+			blk.encodeRaw(src)
+			// In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
+		case nil:
+		default:
+			s.err = err
+			return err
+		}
+		_, s.err = s.w.Write(blk.output)
+		s.nWritten += int64(len(blk.output))
+		s.filling = s.filling[:0]
+		return s.err
+	}
+
 	// Move blocks forward.
 	s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
+	s.nInput += int64(len(s.current))
 	s.wg.Add(1)
 	go func(src []byte) {
-		if debug {
+		if debugEncoder {
 			println("Adding block,", len(src), "bytes, final:", final)
 		}
 		defer func() {
@@ -280,11 +346,11 @@ func (e *Encoder) nextBlock(final bool) error {
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
+				err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
 			}
 			switch err {
 			case errIncompressible:
-				if debug {
+				if debugEncoder {
 					println("Storing incompressible block as raw")
 				}
 				blk.encodeRaw(src)
@@ -307,10 +373,16 @@ func (e *Encoder) nextBlock(final bool) error {
 //
 // The Copy function uses ReaderFrom if available.
 func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
-	if debug {
+	if debugEncoder {
 		println("Using ReadFrom")
 	}
-	// Maybe handle stuff queued?
+
+	// Flush any current writes.
+	if len(e.state.filling) > 0 {
+		if err := e.nextBlock(false); err != nil {
+			return 0, err
+		}
+	}
 	e.state.filling = e.state.filling[:e.o.blockSize]
 	src := e.state.filling
 	for {
@@ -324,20 +396,20 @@ func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
 		switch err {
 		case io.EOF:
 			e.state.filling = e.state.filling[:len(e.state.filling)-len(src)]
-			if debug {
+			if debugEncoder {
 				println("ReadFrom: got EOF final block:", len(e.state.filling))
 			}
-			return n, e.nextBlock(true)
+			return n, nil
+		case nil:
 		default:
-			if debug {
+			if debugEncoder {
 				println("ReadFrom: got error:", err)
 			}
 			e.state.err = err
 			return n, err
-		case nil:
 		}
 		if len(src) > 0 {
-			if debug {
+			if debugEncoder {
 				println("ReadFrom: got space left in source:", len(src))
 			}
 			continue
@@ -382,6 +454,11 @@ func (e *Encoder) Close() error {
 	if err != nil {
 		return err
 	}
+	if s.frameContentSize > 0 {
+		if s.nInput != s.frameContentSize {
+			return fmt.Errorf("frame content size %d given, but %d bytes was written", s.frameContentSize, s.nInput)
+		}
+	}
 	if e.state.fullFrameWritten {
 		return s.err
 	}
@@ -449,7 +526,6 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 	defer func() {
 		// Release encoder reference to last block.
 		// If a non-single block is needed the encoder will reset again.
-		enc.Reset(true)
 		e.encoders <- enc
 	}()
 	// Use single segments when above minimum window and below 1MB.
@@ -459,14 +535,14 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 	}
 	fh := frameHeader{
 		ContentSize:   uint64(len(src)),
-		WindowSize:    uint32(enc.WindowSize(len(src))),
+		WindowSize:    uint32(enc.WindowSize(int64(len(src)))),
 		SingleSegment: single,
 		Checksum:      e.o.crc,
-		DictID:        0,
+		DictID:        e.o.dict.ID(),
 	}
 
 	// If less than 1MB, allocate a buffer up front.
-	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 {
+	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
 		dst = make([]byte, 0, len(src))
 	}
 	dst, err := fh.appendTo(dst)
@@ -476,13 +552,18 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 
 	// If we can do everything in one block, prefer that.
 	if len(src) <= maxCompressedBlockSize {
+		enc.Reset(e.o.dict, true)
 		// Slightly faster with no history and everything in one block.
 		if e.o.crc {
 			_, _ = enc.CRC().Write(src)
 		}
 		blk := enc.Block()
 		blk.last = true
-		enc.EncodeNoHist(blk, src)
+		if e.o.dict == nil {
+			enc.EncodeNoHist(blk, src)
+		} else {
+			enc.Encode(blk, src)
+		}
 
 		// If we got the exact same number of literals as input,
 		// assume the literals cannot be compressed.
@@ -491,12 +572,12 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
 			// Output directly to dst
 			blk.output = dst
-			err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
+			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
 		}
 
 		switch err {
 		case errIncompressible:
-			if debug {
+			if debugEncoder {
 				println("Storing incompressible block as raw")
 			}
 			dst = blk.encodeRawTo(dst, src)
@@ -507,7 +588,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		}
 		blk.output = oldout
 	} else {
-		enc.Reset(false)
+		enc.Reset(e.o.dict, false)
 		blk := enc.Block()
 		for len(src) > 0 {
 			todo := src
@@ -518,7 +599,6 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 			if e.o.crc {
 				_, _ = enc.CRC().Write(todo)
 			}
-			blk.reset(nil)
 			blk.pushOffsets()
 			enc.Encode(blk, todo)
 			if len(src) == 0 {
@@ -528,12 +608,12 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
+				err = blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
 			}
 
 			switch err {
 			case errIncompressible:
-				if debug {
+				if debugEncoder {
 					println("Storing incompressible block as raw")
 				}
 				dst = blk.encodeRawTo(dst, todo)
@@ -543,6 +623,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 			default:
 				panic(err)
 			}
+			blk.reset(nil)
 		}
 	}
 	if e.o.crc {
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index dfac14d..44d8dbd 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -24,29 +24,45 @@ type encoderOptions struct {
 	allLitEntropy   bool
 	customWindow    bool
 	customALEntropy bool
+	customBlockSize bool
+	lowMem          bool
+	dict            *dict
 }
 
 func (o *encoderOptions) setDefault() {
 	*o = encoderOptions{
-		// use less ram: true for now, but may change.
-		concurrent: runtime.GOMAXPROCS(0),
-		crc:        true,
-		single:     nil,
-		blockSize:  1 << 16,
-		windowSize: 8 << 20,
-		level:      SpeedDefault,
+		concurrent:    runtime.GOMAXPROCS(0),
+		crc:           true,
+		single:        nil,
+		blockSize:     maxCompressedBlockSize,
+		windowSize:    8 << 20,
+		level:         SpeedDefault,
+		allLitEntropy: true,
+		lowMem:        false,
 	}
 }
 
 // encoder returns an encoder with the selected options.
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
+	case SpeedFastest:
+		if o.dict != nil {
+			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+
 	case SpeedDefault:
-		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}}
+		if o.dict != nil {
+			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+		}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
 	case SpeedBetterCompression:
-		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
-	case SpeedFastest:
-		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
+		if o.dict != nil {
+			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		}
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+	case SpeedBestCompression:
+		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
 	}
 	panic("unknown compression level")
 }
@@ -58,8 +74,9 @@ func WithEncoderCRC(b bool) EOption {
 }
 
 // WithEncoderConcurrency will set the concurrency,
-// meaning the maximum number of decoders to run concurrently.
+// meaning the maximum number of encoders to run concurrently.
 // The value supplied must be at least 1.
+// For streams, setting a value of 1 will disable async compression.
 // By default this will be set to GOMAXPROCS.
 func WithEncoderConcurrency(n int) EOption {
 	return func(o *encoderOptions) error {
@@ -91,6 +108,7 @@ func WithWindowSize(n int) EOption {
 		o.customWindow = true
 		if o.blockSize > o.windowSize {
 			o.blockSize = o.windowSize
+			o.customBlockSize = true
 		}
 		return nil
 	}
@@ -141,20 +159,20 @@ const (
 	// By using this, notice that CPU usage may go up in the future.
 	SpeedBetterCompression
 
+	// SpeedBestCompression will choose the best available compression option.
+	// This will offer the best compression no matter the CPU cost.
+	SpeedBestCompression
+
 	// speedLast should be kept as the last actual compression option.
 	// The is not for external usage, but is used to keep track of the valid options.
 	speedLast
-
-	// SpeedBestCompression will choose the best available compression option.
-	// For now this is not implemented.
-	SpeedBestCompression = SpeedBetterCompression
 )
 
 // EncoderLevelFromString will convert a string representation of an encoding level back
 // to a compression level. The compare is not case sensitive.
 // If the string wasn't recognized, (false, SpeedDefault) will be returned.
 func EncoderLevelFromString(s string) (bool, EncoderLevel) {
-	for l := EncoderLevel(speedNotSet + 1); l < speedLast; l++ {
+	for l := speedNotSet + 1; l < speedLast; l++ {
 		if strings.EqualFold(s, l.String()) {
 			return true, l
 		}
@@ -171,10 +189,11 @@ func EncoderLevelFromZstd(level int) EncoderLevel {
 		return SpeedFastest
 	case level >= 3 && level < 6:
 		return SpeedDefault
-	case level > 5:
+	case level >= 6 && level < 10:
 		return SpeedBetterCompression
+	default:
+		return SpeedBestCompression
 	}
-	return SpeedDefault
 }
 
 // String provides a string representation of the compression level.
@@ -186,6 +205,8 @@ func (e EncoderLevel) String() string {
 		return "default"
 	case SpeedBetterCompression:
 		return "better"
+	case SpeedBestCompression:
+		return "best"
 	default:
 		return "invalid"
 	}
@@ -203,10 +224,15 @@ func WithEncoderLevel(l EncoderLevel) EOption {
 			switch o.level {
 			case SpeedFastest:
 				o.windowSize = 4 << 20
+				if !o.customBlockSize {
+					o.blockSize = 1 << 16
+				}
 			case SpeedDefault:
 				o.windowSize = 8 << 20
 			case SpeedBetterCompression:
 				o.windowSize = 16 << 20
+			case SpeedBestCompression:
+				o.windowSize = 32 << 20
 			}
 		}
 		if !o.customALEntropy {
@@ -265,3 +291,27 @@ func WithSingleSegment(b bool) EOption {
 		return nil
 	}
 }
+
+// WithLowerEncoderMem will trade in some memory cases trade less memory usage for
+// slower encoding speed.
+// This will not change the window size which is the primary function for reducing
+// memory usage. See WithWindowSize.
+func WithLowerEncoderMem(b bool) EOption {
+	return func(o *encoderOptions) error {
+		o.lowMem = b
+		return nil
+	}
+}
+
+// WithEncoderDict allows to register a dictionary that will be used for the encode.
+// The encoder *may* choose to use no dictionary instead for certain payloads.
+func WithEncoderDict(dict []byte) EOption {
+	return func(o *encoderOptions) error {
+		d, err := loadDict(dict)
+		if err != nil {
+			return err
+		}
+		o.dict = d
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index fc4a566..3ff109c 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -8,27 +8,17 @@ import (
 	"bytes"
 	"encoding/hex"
 	"errors"
-	"hash"
 	"io"
-	"sync"
 
 	"github.com/klauspost/compress/zstd/internal/xxhash"
 )
 
 type frameDec struct {
-	o      decoderOptions
-	crc    hash.Hash64
-	offset int64
+	o   decoderOptions
+	crc *xxhash.Digest
 
 	WindowSize uint64
 
-	// maxWindowSize is the maximum windows size to support.
-	// should never be bigger than max-int.
-	maxWindowSize uint64
-
-	// In order queue of blocks being decoded.
-	decoding chan *blockDec
-
 	// Frame history passed between blocks
 	history history
 
@@ -38,20 +28,18 @@ type frameDec struct {
 	bBuf byteBuf
 
 	FrameContentSize uint64
-	frameDone        sync.WaitGroup
 
 	DictionaryID  *uint32
 	HasCheckSum   bool
 	SingleSegment bool
-
-	// asyncRunning indicates whether the async routine processes input on 'decoding'.
-	asyncRunningMu sync.Mutex
-	asyncRunning   bool
 }
 
 const (
-	// The minimum Window_Size is 1 KB.
+	// MinWindowSize is the minimum Window Size, which is 1 KB.
 	MinWindowSize = 1 << 10
+
+	// MaxWindowSize is the maximum encoder window size
+	// and the default decoder maximum window size.
 	MaxWindowSize = 1 << 29
 )
 
@@ -61,12 +49,11 @@ var (
 )
 
 func newFrameDec(o decoderOptions) *frameDec {
-	d := frameDec{
-		o:             o,
-		maxWindowSize: MaxWindowSize,
+	if o.maxWindowSize > o.maxDecodedSize {
+		o.maxWindowSize = o.maxDecodedSize
 	}
-	if d.maxWindowSize > o.maxDecodedSize {
-		d.maxWindowSize = o.maxDecodedSize
+	d := frameDec{
+		o: o,
 	}
 	return &d
 }
@@ -78,50 +65,74 @@ func newFrameDec(o decoderOptions) *frameDec {
 func (d *frameDec) reset(br byteBuffer) error {
 	d.HasCheckSum = false
 	d.WindowSize = 0
-	var b []byte
+	var signature [4]byte
 	for {
-		b = br.readSmall(4)
-		if b == nil {
+		var err error
+		// Check if we can read more...
+		b, err := br.readSmall(1)
+		switch err {
+		case io.EOF, io.ErrUnexpectedEOF:
+			return io.EOF
+		default:
+			return err
+		case nil:
+			signature[0] = b[0]
+		}
+		// Read the rest, don't allow io.ErrUnexpectedEOF
+		b, err = br.readSmall(3)
+		switch err {
+		case io.EOF:
 			return io.EOF
+		default:
+			return err
+		case nil:
+			copy(signature[1:], b)
 		}
-		if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
-			if debug {
-				println("Not skippable", hex.EncodeToString(b), hex.EncodeToString(skippableFrameMagic))
+
+		if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
+			if debugDecoder {
+				println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
 			}
 			// Break if not skippable frame.
 			break
 		}
 		// Read size to skip
-		b = br.readSmall(4)
-		if b == nil {
-			println("Reading Frame Size EOF")
-			return io.ErrUnexpectedEOF
+		b, err = br.readSmall(4)
+		if err != nil {
+			if debugDecoder {
+				println("Reading Frame Size", err)
+			}
+			return err
 		}
 		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		println("Skipping frame with", n, "bytes.")
-		err := br.skipN(int(n))
+		err = br.skipN(int(n))
 		if err != nil {
-			if debug {
+			if debugDecoder {
 				println("Reading discarded frame", err)
 			}
 			return err
 		}
 	}
-	if !bytes.Equal(b, frameMagic) {
-		println("Got magic numbers: ", b, "want:", frameMagic)
+	if !bytes.Equal(signature[:], frameMagic) {
+		if debugDecoder {
+			println("Got magic numbers: ", signature, "want:", frameMagic)
+		}
 		return ErrMagicMismatch
 	}
 
 	// Read Frame_Header_Descriptor
 	fhd, err := br.readByte()
 	if err != nil {
-		println("Reading Frame_Header_Descriptor", err)
+		if debugDecoder {
+			println("Reading Frame_Header_Descriptor", err)
+		}
 		return err
 	}
 	d.SingleSegment = fhd&(1<<5) != 0
 
 	if fhd&(1<<3) != 0 {
-		return errors.New("Reserved bit set on frame header")
+		return errors.New("reserved bit set on frame header")
 	}
 
 	// Read Window_Descriptor
@@ -130,7 +141,9 @@ func (d *frameDec) reset(br byteBuffer) error {
 	if !d.SingleSegment {
 		wd, err := br.readByte()
 		if err != nil {
-			println("Reading Window_Descriptor", err)
+			if debugDecoder {
+				println("Reading Window_Descriptor", err)
+			}
 			return err
 		}
 		printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3)
@@ -147,12 +160,11 @@ func (d *frameDec) reset(br byteBuffer) error {
 		if size == 3 {
 			size = 4
 		}
-		b = br.readSmall(int(size))
-		if b == nil {
-			if debug {
-				println("Reading Dictionary_ID", io.ErrUnexpectedEOF)
-			}
-			return io.ErrUnexpectedEOF
+
+		b, err := br.readSmall(int(size))
+		if err != nil {
+			println("Reading Dictionary_ID", err)
+			return err
 		}
 		var id uint32
 		switch size {
@@ -163,7 +175,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 		case 4:
 			id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		}
-		if debug {
+		if debugDecoder {
 			println("Dict size", size, "ID:", id)
 		}
 		if id > 0 {
@@ -185,12 +197,12 @@ func (d *frameDec) reset(br byteBuffer) error {
 	default:
 		fcsSize = 1 << v
 	}
-	d.FrameContentSize = 0
+	d.FrameContentSize = fcsUnknown
 	if fcsSize > 0 {
-		b := br.readSmall(fcsSize)
-		if b == nil {
-			println("Reading Frame content", io.ErrUnexpectedEOF)
-			return io.ErrUnexpectedEOF
+		b, err := br.readSmall(fcsSize)
+		if err != nil {
+			println("Reading Frame content", err)
+			return err
 		}
 		switch fcsSize {
 		case 1:
@@ -205,10 +217,11 @@ func (d *frameDec) reset(br byteBuffer) error {
 			d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
 			d.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
 		}
-		if debug {
-			println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize)
+		if debugDecoder {
+			println("Read FCS:", d.FrameContentSize)
 		}
 	}
+
 	// Move this to shared.
 	d.HasCheckSum = fhd&(1<<2) != 0
 	if d.HasCheckSum {
@@ -226,21 +239,31 @@ func (d *frameDec) reset(br byteBuffer) error {
 		}
 	}
 
-	if d.WindowSize > d.maxWindowSize {
-		printf("window size %d > max %d\n", d.WindowSize, d.maxWindowSize)
+	if d.WindowSize > uint64(d.o.maxWindowSize) {
+		if debugDecoder {
+			printf("window size %d > max %d\n", d.WindowSize, d.o.maxWindowSize)
+		}
 		return ErrWindowSizeExceeded
 	}
 	// The minimum Window_Size is 1 KB.
 	if d.WindowSize < MinWindowSize {
-		println("got window size: ", d.WindowSize)
+		if debugDecoder {
+			println("got window size: ", d.WindowSize)
+		}
 		return ErrWindowSizeTooSmall
 	}
 	d.history.windowSize = int(d.WindowSize)
 	if d.o.lowMem && d.history.windowSize < maxBlockSize {
-		d.history.maxSize = d.history.windowSize * 2
+		d.history.allocFrameBuffer = d.history.windowSize * 2
+		// TODO: Maybe use FrameContent size
 	} else {
-		d.history.maxSize = d.history.windowSize + maxBlockSize
+		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
 	}
+
+	if debugDecoder {
+		println("Frame: Dict:", d.DictionaryID, "FrameContentSize:", d.FrameContentSize, "singleseg:", d.SingleSegment, "window:", d.WindowSize, "crc:", d.HasCheckSum)
+	}
+
 	// history contains input - maybe we do something
 	d.rawInput = br
 	return nil
@@ -248,56 +271,37 @@ func (d *frameDec) reset(br byteBuffer) error {
 
 // next will start decoding the next block from stream.
 func (d *frameDec) next(block *blockDec) error {
-	if debug {
-		printf("decoding new block %p:%p", block, block.data)
+	if debugDecoder {
+		println("decoding new block")
 	}
 	err := block.reset(d.rawInput, d.WindowSize)
 	if err != nil {
 		println("block error:", err)
 		// Signal the frame decoder we have a problem.
-		d.sendErr(block, err)
+		block.sendErr(err)
 		return err
 	}
-	block.input <- struct{}{}
-	if debug {
-		println("next block:", block)
-	}
-	d.asyncRunningMu.Lock()
-	defer d.asyncRunningMu.Unlock()
-	if !d.asyncRunning {
-		return nil
-	}
-	if block.Last {
-		// We indicate the frame is done by sending io.EOF
-		d.decoding <- block
-		return io.EOF
-	}
-	d.decoding <- block
 	return nil
 }
 
-// sendEOF will queue an error block on the frame.
-// This will cause the frame decoder to return when it encounters the block.
-// Returns true if the decoder was added.
-func (d *frameDec) sendErr(block *blockDec, err error) bool {
-	d.asyncRunningMu.Lock()
-	defer d.asyncRunningMu.Unlock()
-	if !d.asyncRunning {
-		return false
-	}
-
-	println("sending error", err.Error())
-	block.sendErr(err)
-	d.decoding <- block
-	return true
-}
-
 // checkCRC will check the checksum if the frame has one.
 // Will return ErrCRCMismatch if crc check failed, otherwise nil.
 func (d *frameDec) checkCRC() error {
 	if !d.HasCheckSum {
 		return nil
 	}
+
+	// We can overwrite upper tmp now
+	want, err := d.rawInput.readSmall(4)
+	if err != nil {
+		println("CRC missing?", err)
+		return err
+	}
+
+	if d.o.ignoreChecksum {
+		return nil
+	}
+
 	var tmp [4]byte
 	got := d.crc.Sum64()
 	// Flip to match file order.
@@ -306,142 +310,29 @@ func (d *frameDec) checkCRC() error {
 	tmp[2] = byte(got >> 16)
 	tmp[3] = byte(got >> 24)
 
-	// We can overwrite upper tmp now
-	want := d.rawInput.readSmall(4)
-	if want == nil {
-		println("CRC missing?")
-		return io.ErrUnexpectedEOF
-	}
-
 	if !bytes.Equal(tmp[:], want) {
-		if debug {
+		if debugDecoder {
 			println("CRC Check Failed:", tmp[:], "!=", want)
 		}
 		return ErrCRCMismatch
 	}
-	if debug {
+	if debugDecoder {
 		println("CRC ok", tmp[:])
 	}
 	return nil
 }
 
-func (d *frameDec) initAsync() {
-	if !d.o.lowMem && !d.SingleSegment {
-		// set max extra size history to 10MB.
-		d.history.maxSize = d.history.windowSize + maxBlockSize*5
-	}
-	// re-alloc if more than one extra block size.
-	if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
-		d.history.b = make([]byte, 0, d.history.maxSize)
-	}
-	if cap(d.history.b) < d.history.maxSize {
-		d.history.b = make([]byte, 0, d.history.maxSize)
-	}
-	if cap(d.decoding) < d.o.concurrent {
-		d.decoding = make(chan *blockDec, d.o.concurrent)
-	}
-	if debug {
-		h := d.history
-		printf("history init. len: %d, cap: %d", len(h.b), cap(h.b))
-	}
-	d.asyncRunningMu.Lock()
-	d.asyncRunning = true
-	d.asyncRunningMu.Unlock()
-}
-
-// startDecoder will start decoding blocks and write them to the writer.
-// The decoder will stop as soon as an error occurs or at end of frame.
-// When the frame has finished decoding the *bufio.Reader
-// containing the remaining input will be sent on frameDec.frameDone.
-func (d *frameDec) startDecoder(output chan decodeOutput) {
-	written := int64(0)
-
-	defer func() {
-		d.asyncRunningMu.Lock()
-		d.asyncRunning = false
-		d.asyncRunningMu.Unlock()
-
-		// Drain the currently decoding.
-		d.history.error = true
-	flushdone:
-		for {
-			select {
-			case b := <-d.decoding:
-				b.history <- &d.history
-				output <- <-b.result
-			default:
-				break flushdone
-			}
-		}
-		println("frame decoder done, signalling done")
-		d.frameDone.Done()
-	}()
-	// Get decoder for first block.
-	block := <-d.decoding
-	block.history <- &d.history
-	for {
-		var next *blockDec
-		// Get result
-		r := <-block.result
-		if r.err != nil {
-			println("Result contained error", r.err)
-			output <- r
-			return
-		}
-		if debug {
-			println("got result, from ", d.offset, "to", d.offset+int64(len(r.b)))
-			d.offset += int64(len(r.b))
-		}
-		if !block.Last {
-			// Send history to next block
-			select {
-			case next = <-d.decoding:
-				if debug {
-					println("Sending ", len(d.history.b), "bytes as history")
-				}
-				next.history <- &d.history
-			default:
-				// Wait until we have sent the block, so
-				// other decoders can potentially get the decoder.
-				next = nil
-			}
-		}
-
-		// Add checksum, async to decoding.
-		if d.HasCheckSum {
-			n, err := d.crc.Write(r.b)
-			if err != nil {
-				r.err = err
-				if n != len(r.b) {
-					r.err = io.ErrShortWrite
-				}
-				output <- r
-				return
-			}
-		}
-		written += int64(len(r.b))
-		if d.SingleSegment && uint64(written) > d.FrameContentSize {
-			println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize)
-			r.err = ErrFrameSizeExceeded
-			output <- r
-			return
-		}
-		if block.Last {
-			r.err = d.checkCRC()
-			output <- r
-			return
-		}
-		output <- r
-		if next == nil {
-			// There was no decoder available, we wait for one now that we have sent to the writer.
-			if debug {
-				println("Sending ", len(d.history.b), " bytes as history")
-			}
-			next = <-d.decoding
-			next.history <- &d.history
+// consumeCRC reads the checksum data if the frame has one.
+func (d *frameDec) consumeCRC() error {
+	if d.HasCheckSum {
+		_, err := d.rawInput.readSmall(4)
+		if err != nil {
+			println("CRC missing?", err)
+			return err
 		}
-		block = next
 	}
+
+	return nil
 }
 
 // runDecoder will create a sync decoder that will decode a block of data.
@@ -450,41 +341,67 @@ func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 
 	// We use the history for output to avoid copying it.
 	d.history.b = dst
+	d.history.ignoreBuffer = len(dst)
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
+	d.history.decoders.maxSyncLen = 0
+	if d.FrameContentSize != fcsUnknown {
+		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+			return dst, ErrDecoderSizeExceeded
+		}
+		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+			// Alloc for output
+			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
+			copy(dst2, dst)
+			dst = dst2
+		}
+	}
 	var err error
 	for {
 		err = dec.reset(d.rawInput, d.WindowSize)
 		if err != nil {
 			break
 		}
-		if debug {
+		if debugDecoder {
 			println("next block:", dec)
 		}
 		err = dec.decodeBuf(&d.history)
-		if err != nil || dec.Last {
+		if err != nil {
 			break
 		}
 		if uint64(len(d.history.b)) > d.o.maxDecodedSize {
 			err = ErrDecoderSizeExceeded
 			break
 		}
-		if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize {
-			println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
+		if uint64(len(d.history.b)-crcStart) > d.FrameContentSize {
+			println("runDecoder: FrameContentSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.FrameContentSize)
 			err = ErrFrameSizeExceeded
 			break
 		}
+		if dec.Last {
+			break
+		}
+		if debugDecoder {
+			println("runDecoder: FrameContentSize", uint64(len(d.history.b)-crcStart), "<=", d.FrameContentSize)
+		}
 	}
 	dst = d.history.b
 	if err == nil {
-		if d.HasCheckSum {
-			var n int
-			n, err = d.crc.Write(dst[crcStart:])
-			if err == nil {
-				if n != len(dst)-crcStart {
-					err = io.ErrShortWrite
-				} else {
-					err = d.checkCRC()
+		if d.FrameContentSize != fcsUnknown && uint64(len(d.history.b)-crcStart) != d.FrameContentSize {
+			err = ErrFrameSizeMismatch
+		} else if d.HasCheckSum {
+			if d.o.ignoreChecksum {
+				err = d.consumeCRC()
+			} else {
+				var n int
+				n, err = d.crc.Write(dst[crcStart:])
+				if err == nil {
+					if n != len(dst)-crcStart {
+						err = io.ErrShortWrite
+					} else {
+						err = d.checkCRC()
+					}
 				}
 			}
 		}
diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go
index 4479cfe..4ef7f5a 100644
--- a/vendor/github.com/klauspost/compress/zstd/frameenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/frameenc.go
@@ -5,6 +5,7 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"fmt"
 	"io"
 	"math"
@@ -16,7 +17,7 @@ type frameHeader struct {
 	WindowSize    uint32
 	SingleSegment bool
 	Checksum      bool
-	DictID        uint32 // Not stored.
+	DictID        uint32
 }
 
 const maxHeaderSize = 14
@@ -30,6 +31,24 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
 	if f.SingleSegment {
 		fhd |= 1 << 5
 	}
+
+	var dictIDContent []byte
+	if f.DictID > 0 {
+		var tmp [4]byte
+		if f.DictID < 256 {
+			fhd |= 1
+			tmp[0] = uint8(f.DictID)
+			dictIDContent = tmp[:1]
+		} else if f.DictID < 1<<16 {
+			fhd |= 2
+			binary.LittleEndian.PutUint16(tmp[:2], uint16(f.DictID))
+			dictIDContent = tmp[:2]
+		} else {
+			fhd |= 3
+			binary.LittleEndian.PutUint32(tmp[:4], f.DictID)
+			dictIDContent = tmp[:4]
+		}
+	}
 	var fcs uint8
 	if f.ContentSize >= 256 {
 		fcs++
@@ -40,6 +59,7 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
 	if f.ContentSize >= 0xffffffff {
 		fcs++
 	}
+
 	fhd |= fcs << 6
 
 	dst = append(dst, fhd)
@@ -48,7 +68,9 @@ func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
 		windowLog := (bits.Len32(f.WindowSize-1) - winLogMin) << 3
 		dst = append(dst, uint8(windowLog))
 	}
-
+	if f.DictID > 0 {
+		dst = append(dst, dictIDContent...)
+	}
 	switch fcs {
 	case 0:
 		if f.SingleSegment {
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
index e6d3d49..fde4e6b 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -5,8 +5,10 @@
 package zstd
 
 import (
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"io"
 )
 
 const (
@@ -182,6 +184,29 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	return s.buildDtable()
 }
 
+func (s *fseDecoder) mustReadFrom(r io.Reader) {
+	fatalErr := func(err error) {
+		if err != nil {
+			panic(err)
+		}
+	}
+	// 	dt             [maxTablesize]decSymbol // Decompression table.
+	//	symbolLen      uint16                  // Length of active part of the symbol table.
+	//	actualTableLog uint8                   // Selected tablelog.
+	//	maxBits        uint8                   // Maximum number of additional bits
+	//	// used for table creation to avoid allocations.
+	//	stateTable [256]uint16
+	//	norm       [maxSymbolValue + 1]int16
+	//	preDefined bool
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.dt))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.symbolLen))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.actualTableLog))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.maxBits))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.stateTable))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.norm))
+	fatalErr(binary.Read(r, binary.LittleEndian, &s.preDefined))
+}
+
 // decSymbol contains information about a state entry,
 // Including the state offset base, the output symbol and
 // the number of bits to read for the low part of the destination state.
@@ -379,7 +404,7 @@ func (s decSymbol) final() (int, uint8) {
 // This can only be used if no symbols are 0 bits.
 // At least tablelog bits must be available in the bit reader.
 func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
-	lowBits := uint16(br.getBitsFast(s.state.nbBits()))
+	lowBits := br.get16BitsFast(s.state.nbBits())
 	s.state = s.dt[s.state.newState()+lowBits]
 	return s.state.baseline(), s.state.addBits()
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
index aa9eba8..5442061 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -62,9 +62,8 @@ func (s symbolTransform) String() string {
 // To indicate that you have populated the histogram call HistogramFinished
 // with the value of the highest populated symbol, as well as the number of entries
 // in the most populated entry. These are accepted at face value.
-// The returned slice will always be length 256.
-func (s *fseEncoder) Histogram() []uint32 {
-	return s.count[:]
+func (s *fseEncoder) Histogram() *[256]uint32 {
+	return &s.count
 }
 
 // HistogramFinished can be called to indicate that the histogram has been populated.
@@ -97,7 +96,7 @@ func (s *fseEncoder) prepare() (*fseEncoder, error) {
 func (s *fseEncoder) allocCtable() {
 	tableSize := 1 << s.actualTableLog
 	// get tableSymbol that is big enough.
-	if cap(s.ct.tableSymbol) < int(tableSize) {
+	if cap(s.ct.tableSymbol) < tableSize {
 		s.ct.tableSymbol = make([]byte, tableSize)
 	}
 	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
@@ -202,13 +201,13 @@ func (s *fseEncoder) buildCTable() error {
 			case 0:
 			case -1, 1:
 				symbolTT[i].deltaNbBits = tl
-				symbolTT[i].deltaFindState = int16(total - 1)
+				symbolTT[i].deltaFindState = total - 1
 				total++
 			default:
 				maxBitsOut := uint32(tableLog) - highBit(uint32(v-1))
 				minStatePlus := uint32(v) << maxBitsOut
 				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
-				symbolTT[i].deltaFindState = int16(total - v)
+				symbolTT[i].deltaFindState = total - v
 				total += v
 			}
 		}
@@ -229,7 +228,7 @@ func (s *fseEncoder) setRLE(val byte) {
 		deltaFindState: 0,
 		deltaNbBits:    0,
 	}
-	if debug {
+	if debugEncoder {
 		println("setRLE: val", val, "symbolTT", s.ct.symbolTT[val])
 	}
 	s.rleVal = val
@@ -353,8 +352,8 @@ func (s *fseEncoder) normalizeCount2(length int) error {
 		distributed  uint32
 		total        = uint32(length)
 		tableLog     = s.actualTableLog
-		lowThreshold = uint32(total >> tableLog)
-		lowOne       = uint32((total * 3) >> (tableLog + 1))
+		lowThreshold = total >> tableLog
+		lowOne       = (total * 3) >> (tableLog + 1)
 	)
 	for i, cnt := range s.count[:s.symbolLen] {
 		if cnt == 0 {
@@ -379,7 +378,7 @@ func (s *fseEncoder) normalizeCount2(length int) error {
 
 	if (total / toDistribute) > lowOne {
 		// risk of rounding to zero
-		lowOne = uint32((total * 3) / (toDistribute * 2))
+		lowOne = (total * 3) / (toDistribute * 2)
 		for i, cnt := range s.count[:s.symbolLen] {
 			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
 				s.norm[i] = 1
@@ -708,7 +707,6 @@ func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
 	im := int32((nbBitsOut << 16) - first.deltaNbBits)
 	lu := (im >> nbBitsOut) + int32(first.deltaFindState)
 	c.state = c.stateTable[lu]
-	return
 }
 
 // encode the output symbol provided and write it to the bitstream.
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
index 6c17dc1..474cb77 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
@@ -59,7 +59,7 @@ func fillBase(dst []baseOffset, base uint32, bits ...uint8) {
 	}
 	for i, bit := range bits {
 		if base > math.MaxInt32 {
-			panic(fmt.Sprintf("invalid decoding table, base overflows int32"))
+			panic("invalid decoding table, base overflows int32")
 		}
 
 		dst[i] = baseOffset{
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
index 4a75206..cf33f29 100644
--- a/vendor/github.com/klauspost/compress/zstd/hash.go
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@@ -13,24 +13,24 @@ const (
 	prime8bytes = 0xcf1bbcdcb7a56463
 )
 
-// hashLen returns a hash of the lowest l bytes of u for a size size of h bytes.
-// l must be >=4 and <=8. Any other value will return hash for 4 bytes.
-// h should always be <32.
-// Preferably h and l should be a constant.
-// FIXME: This does NOT get resolved, if 'mls' is constant,
-//  so this cannot be used.
-func hashLen(u uint64, hashLog, mls uint8) uint32 {
+// hashLen returns a hash of the lowest mls bytes of with length output bits.
+// mls must be >=3 and <=8. Any other value will return hash for 4 bytes.
+// length should always be < 32.
+// Preferably length and mls should be a constant for inlining.
+func hashLen(u uint64, length, mls uint8) uint32 {
 	switch mls {
+	case 3:
+		return (uint32(u<<8) * prime3bytes) >> (32 - length)
 	case 5:
-		return hash5(u, hashLog)
+		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
 	case 6:
-		return hash6(u, hashLog)
+		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
 	case 7:
-		return hash7(u, hashLog)
+		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
 	case 8:
-		return hash8(u, hashLog)
+		return uint32((u * prime8bytes) >> (64 - length))
 	default:
-		return hash4x64(u, hashLog)
+		return (uint32(u) * prime4bytes) >> (32 - length)
 	}
 }
 
@@ -39,39 +39,3 @@ func hashLen(u uint64, hashLog, mls uint8) uint32 {
 func hash3(u uint32, h uint8) uint32 {
 	return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
 }
-
-// hash4 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4(u uint32, h uint8) uint32 {
-	return (u * prime4bytes) >> ((32 - h) & 31)
-}
-
-// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <32.
-func hash4x64(u uint64, h uint8) uint32 {
-	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
-}
-
-// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash5(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
-}
-
-// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash6(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
-}
-
-// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash7(u uint64, h uint8) uint32 {
-	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
-}
-
-// hash8 returns the hash of u to fit in a hash table with h bits.
-// Preferably h should be a constant and should always be <64.
-func hash8(u uint64, h uint8) uint32 {
-	return uint32((u * prime8bytes) >> ((64 - h) & 63))
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index f418f50..28b4015 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -10,20 +10,31 @@ import (
 
 // history contains the information transferred between blocks.
 type history struct {
-	b             []byte
-	huffTree      *huff0.Scratch
-	recentOffsets [3]int
+	// Literal decompression
+	huffTree *huff0.Scratch
+
+	// Sequence decompression
 	decoders      sequenceDecs
-	windowSize    int
-	maxSize       int
-	error         bool
-	dict          *dict
+	recentOffsets [3]int
+
+	// History buffer...
+	b []byte
+
+	// ignoreBuffer is meant to ignore a number of bytes
+	// when checking for matches in history
+	ignoreBuffer int
+
+	windowSize       int
+	allocFrameBuffer int // needed?
+	error            bool
+	dict             *dict
 }
 
 // reset will reset the history to initial state of a frame.
 // The history must already have been initialized to the desired size.
 func (h *history) reset() {
 	h.b = h.b[:0]
+	h.ignoreBuffer = 0
 	h.error = false
 	h.recentOffsets = [3]int{1, 4, 8}
 	if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
@@ -35,9 +46,9 @@ func (h *history) reset() {
 	if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
 		fseDecoderPool.Put(f)
 	}
-	h.decoders = sequenceDecs{}
+	h.decoders = sequenceDecs{br: h.decoders.br}
 	if h.huffTree != nil {
-		if h.dict == nil || h.dict.litDec != h.huffTree {
+		if h.dict == nil || h.dict.litEnc != h.huffTree {
 			huffDecoderPool.Put(h.huffTree)
 		}
 	}
@@ -54,8 +65,9 @@ func (h *history) setDict(dict *dict) {
 	h.decoders.litLengths = dict.llDec
 	h.decoders.offsets = dict.ofDec
 	h.decoders.matchLengths = dict.mlDec
+	h.decoders.dict = dict.content
 	h.recentOffsets = dict.offsets
-	h.huffTree = dict.litDec
+	h.huffTree = dict.litEnc
 }
 
 // append bytes to history.
@@ -83,6 +95,24 @@ func (h *history) append(b []byte) {
 	copy(h.b[h.windowSize-len(b):], b)
 }
 
+// ensureBlock will ensure there is space for at least one block...
+func (h *history) ensureBlock() {
+	if cap(h.b) < h.allocFrameBuffer {
+		h.b = make([]byte, 0, h.allocFrameBuffer)
+		return
+	}
+
+	avail := cap(h.b) - len(h.b)
+	if avail >= h.windowSize || avail > maxCompressedBlockSize {
+		return
+	}
+	// Move data down so we only have window size left.
+	// We know we have less than window size in b at this point.
+	discard := len(h.b) - h.windowSize
+	copy(h.b, h.b[discard:])
+	h.b = h.b[:h.windowSize]
+}
+
 // append bytes to history without ever discarding anything.
 func (h *history) appendKeep(b []byte) {
 	h.b = append(h.b, b...)
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
index 426b9ca..2c112a0 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
@@ -195,7 +195,6 @@ func (d *Digest) UnmarshalBinary(b []byte) error {
 	b, d.v4 = consumeUint64(b)
 	b, d.total = consumeUint64(b)
 	copy(d.mem[:], b)
-	b = b[len(d.mem):]
 	d.n = int(d.total % uint64(len(d.mem)))
 	return nil
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index 2c9c535..cea1785 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -1,12 +1,13 @@
 // +build !appengine
 // +build gc
 // +build !purego
+// +build !noasm
 
 #include "textflag.h"
 
 // Register allocation:
 // AX	h
-// CX	pointer to advance through b
+// SI	pointer to advance through b
 // DX	n
 // BX	loop end
 // R8	v1, k1
@@ -16,39 +17,39 @@
 // R12	tmp
 // R13	prime1v
 // R14	prime2v
-// R15	prime4v
+// DI	prime4v
 
-// round reads from and advances the buffer pointer in CX.
+// round reads from and advances the buffer pointer in SI.
 // It assumes that R13 has prime1v and R14 has prime2v.
 #define round(r) \
-	MOVQ  (CX), R12 \
-	ADDQ  $8, CX    \
+	MOVQ  (SI), R12 \
+	ADDQ  $8, SI    \
 	IMULQ R14, R12  \
 	ADDQ  R12, r    \
 	ROLQ  $31, r    \
 	IMULQ R13, r
 
 // mergeRound applies a merge round on the two registers acc and val.
-// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
+// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
 #define mergeRound(acc, val) \
 	IMULQ R14, val \
 	ROLQ  $31, val \
 	IMULQ R13, val \
 	XORQ  val, acc \
 	IMULQ R13, acc \
-	ADDQ  R15, acc
+	ADDQ  DI, acc
 
 // func Sum64(b []byte) uint64
 TEXT ·Sum64(SB), NOSPLIT, $0-32
 	// Load fixed primes.
 	MOVQ ·prime1v(SB), R13
 	MOVQ ·prime2v(SB), R14
-	MOVQ ·prime4v(SB), R15
+	MOVQ ·prime4v(SB), DI
 
 	// Load slice.
-	MOVQ b_base+0(FP), CX
+	MOVQ b_base+0(FP), SI
 	MOVQ b_len+8(FP), DX
-	LEAQ (CX)(DX*1), BX
+	LEAQ (SI)(DX*1), BX
 
 	// The first loop limit will be len(b)-32.
 	SUBQ $32, BX
@@ -65,14 +66,14 @@ TEXT ·Sum64(SB), NOSPLIT, $0-32
 	XORQ R11, R11
 	SUBQ R13, R11
 
-	// Loop until CX > BX.
+	// Loop until SI > BX.
 blockLoop:
 	round(R8)
 	round(R9)
 	round(R10)
 	round(R11)
 
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JLE  blockLoop
 
 	MOVQ R8, AX
@@ -100,16 +101,16 @@ noBlocks:
 afterBlocks:
 	ADDQ DX, AX
 
-	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
+	// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
 	ADDQ $24, BX
 
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JG   fourByte
 
 wordLoop:
 	// Calculate k1.
-	MOVQ  (CX), R8
-	ADDQ  $8, CX
+	MOVQ  (SI), R8
+	ADDQ  $8, SI
 	IMULQ R14, R8
 	ROLQ  $31, R8
 	IMULQ R13, R8
@@ -117,18 +118,18 @@ wordLoop:
 	XORQ  R8, AX
 	ROLQ  $27, AX
 	IMULQ R13, AX
-	ADDQ  R15, AX
+	ADDQ  DI, AX
 
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JLE  wordLoop
 
 fourByte:
 	ADDQ $4, BX
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JG   singles
 
-	MOVL  (CX), R8
-	ADDQ  $4, CX
+	MOVL  (SI), R8
+	ADDQ  $4, SI
 	IMULQ R13, R8
 	XORQ  R8, AX
 
@@ -138,19 +139,19 @@ fourByte:
 
 singles:
 	ADDQ $4, BX
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JGE  finalize
 
 singlesLoop:
-	MOVBQZX (CX), R12
-	ADDQ    $1, CX
+	MOVBQZX (SI), R12
+	ADDQ    $1, SI
 	IMULQ   ·prime5v(SB), R12
 	XORQ    R12, AX
 
 	ROLQ  $11, AX
 	IMULQ R13, AX
 
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JL   singlesLoop
 
 finalize:
@@ -179,13 +180,13 @@ TEXT ·writeBlocks(SB), NOSPLIT, $0-40
 	MOVQ ·prime2v(SB), R14
 
 	// Load slice.
-	MOVQ arg1_base+8(FP), CX
-	MOVQ arg1_len+16(FP), DX
-	LEAQ (CX)(DX*1), BX
+	MOVQ b_base+8(FP), SI
+	MOVQ b_len+16(FP), DX
+	LEAQ (SI)(DX*1), BX
 	SUBQ $32, BX
 
 	// Load vN from d.
-	MOVQ arg+0(FP), AX
+	MOVQ d+0(FP), AX
 	MOVQ 0(AX), R8   // v1
 	MOVQ 8(AX), R9   // v2
 	MOVQ 16(AX), R10 // v3
@@ -199,7 +200,7 @@ blockLoop:
 	round(R10)
 	round(R11)
 
-	CMPQ CX, BX
+	CMPQ SI, BX
 	JLE  blockLoop
 
 	// Copy vN back to d.
@@ -208,8 +209,8 @@ blockLoop:
 	MOVQ R10, 16(AX)
 	MOVQ R11, 24(AX)
 
-	// The number of bytes written is CX minus the old base pointer.
-	SUBQ arg1_base+8(FP), CX
-	MOVQ CX, ret+32(FP)
+	// The number of bytes written is SI minus the old base pointer.
+	SUBQ b_base+8(FP), SI
+	MOVQ SI, ret+32(FP)
 
 	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
new file mode 100644
index 0000000..4d64a17
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
@@ -0,0 +1,186 @@
+// +build gc,!purego,!noasm
+
+#include "textflag.h"
+
+// Register allocation.
+#define digest	R1
+#define h	R2 // Return value.
+#define p	R3 // Input pointer.
+#define len	R4
+#define nblocks	R5 // len / 32.
+#define prime1	R7
+#define prime2	R8
+#define prime3	R9
+#define prime4	R10
+#define prime5	R11
+#define v1	R12
+#define v2	R13
+#define v3	R14
+#define v4	R15
+#define x1	R20
+#define x2	R21
+#define x3	R22
+#define x4	R23
+
+#define round(acc, x) \
+	MADD prime2, acc, x, acc \
+	ROR  $64-31, acc         \
+	MUL  prime1, acc         \
+
+// x = round(0, x).
+#define round0(x) \
+	MUL prime2, x \
+	ROR $64-31, x \
+	MUL prime1, x \
+
+#define mergeRound(x) \
+	round0(x)                 \
+	EOR  x, h                 \
+	MADD h, prime4, prime1, h \
+
+// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
+#define blocksLoop() \
+	LSR     $5, len, nblocks \
+	PCALIGN $16              \
+	loop:                    \
+	LDP.P   32(p), (x1, x2)  \
+	round(v1, x1)            \
+	LDP     -16(p), (x3, x4) \
+	round(v2, x2)            \
+	SUB     $1, nblocks      \
+	round(v3, x3)            \
+	round(v4, x4)            \
+	CBNZ    nblocks, loop    \
+
+// The primes are repeated here to ensure that they're stored
+// in a contiguous array, so we can load them with LDP.
+DATA primes<> +0(SB)/8, $11400714785074694791
+DATA primes<> +8(SB)/8, $14029467366897019727
+DATA primes<>+16(SB)/8, $1609587929392839161
+DATA primes<>+24(SB)/8, $9650029242287828579
+DATA primes<>+32(SB)/8, $2870177450012600261
+GLOBL primes<>(SB), NOPTR+RODATA, $40
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
+	LDP b_base+0(FP), (p, len)
+
+	LDP  primes<> +0(SB), (prime1, prime2)
+	LDP  primes<>+16(SB), (prime3, prime4)
+	MOVD primes<>+32(SB), prime5
+
+	CMP  $32, len
+	CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
+	BLO  afterLoop
+
+	ADD  prime1, prime2, v1
+	MOVD prime2, v2
+	MOVD $0, v3
+	NEG  prime1, v4
+
+	blocksLoop()
+
+	ROR $64-1, v1, x1
+	ROR $64-7, v2, x2
+	ADD x1, x2
+	ROR $64-12, v3, x3
+	ROR $64-18, v4, x4
+	ADD x3, x4
+	ADD x2, x4, h
+
+	mergeRound(v1)
+	mergeRound(v2)
+	mergeRound(v3)
+	mergeRound(v4)
+
+afterLoop:
+	ADD len, h
+
+	TBZ   $4, len, try8
+	LDP.P 16(p), (x1, x2)
+
+	round0(x1)
+	ROR  $64-27, h
+	EOR  x1 @> 64-27, h, h
+	MADD h, prime4, prime1, h
+
+	round0(x2)
+	ROR  $64-27, h
+	EOR  x2 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try8:
+	TBZ    $3, len, try4
+	MOVD.P 8(p), x1
+
+	round0(x1)
+	ROR  $64-27, h
+	EOR  x1 @> 64-27, h
+	MADD h, prime4, prime1, h
+
+try4:
+	TBZ     $2, len, try2
+	MOVWU.P 4(p), x2
+
+	MUL  prime1, x2
+	ROR  $64-23, h
+	EOR  x2 @> 64-23, h
+	MADD h, prime3, prime2, h
+
+try2:
+	TBZ     $1, len, try1
+	MOVHU.P 2(p), x3
+	AND     $255, x3, x1
+	LSR     $8, x3, x2
+
+	MUL prime5, x1
+	ROR $64-11, h
+	EOR x1 @> 64-11, h
+	MUL prime1, h
+
+	MUL prime5, x2
+	ROR $64-11, h
+	EOR x2 @> 64-11, h
+	MUL prime1, h
+
+try1:
+	TBZ   $0, len, end
+	MOVBU (p), x4
+
+	MUL prime5, x4
+	ROR $64-11, h
+	EOR x4 @> 64-11, h
+	MUL prime1, h
+
+end:
+	EOR h >> 33, h
+	MUL prime2, h
+	EOR h >> 29, h
+	MUL prime3, h
+	EOR h >> 32, h
+
+	MOVD h, ret+24(FP)
+	RET
+
+// func writeBlocks(d *Digest, b []byte) int
+//
+// Assumes len(b) >= 32.
+TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
+	LDP primes<>(SB), (prime1, prime2)
+
+	// Load state. Assume v[1-4] are stored contiguously.
+	MOVD d+0(FP), digest
+	LDP  0(digest), (v1, v2)
+	LDP  16(digest), (v3, v4)
+
+	LDP b_base+8(FP), (p, len)
+
+	blocksLoop()
+
+	// Store updated state.
+	STP (v1, v2), 0(digest)
+	STP (v3, v4), 16(digest)
+
+	BIC  $31, len
+	MOVD len, ret+32(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
similarity index 54%
rename from vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
rename to vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
index 35318d7..1a1fac9 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
@@ -1,6 +1,9 @@
+//go:build (amd64 || arm64) && !appengine && gc && !purego && !noasm
+// +build amd64 arm64
 // +build !appengine
 // +build gc
 // +build !purego
+// +build !noasm
 
 package xxhash
 
@@ -10,4 +13,4 @@ package xxhash
 func Sum64(b []byte) uint64
 
 //go:noescape
-func writeBlocks(*Digest, []byte) int
+func writeBlocks(d *Digest, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
index 4a5a821..209cb4a 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -1,4 +1,5 @@
-// +build !amd64 appengine !gc purego
+//go:build (!amd64 && !arm64) || appengine || !gc || purego || noasm
+// +build !amd64,!arm64 appengine !gc purego noasm
 
 package xxhash
 
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index 7ff8704..e80139d 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -20,6 +20,10 @@ type seq struct {
 	llCode, mlCode, ofCode uint8
 }
 
+type seqVals struct {
+	ll, ml, mo int
+}
+
 func (s seq) String() string {
 	if s.offset <= 3 {
 		if s.offset == 0 {
@@ -61,16 +65,19 @@ type sequenceDecs struct {
 	offsets      sequenceDec
 	matchLengths sequenceDec
 	prevOffset   [3]int
-	hist         []byte
 	dict         []byte
 	literals     []byte
 	out          []byte
+	nSeqs        int
+	br           *bitReader
+	seqSize      int
 	windowSize   int
 	maxBits      uint8
+	maxSyncLen   uint64
 }
 
 // initialize all 3 decoders from the stream input.
-func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error {
+func (s *sequenceDecs) initialize(br *bitReader, hist *history, out []byte) error {
 	if err := s.litLengths.init(br); err != nil {
 		return errors.New("litLengths:" + err.Error())
 	}
@@ -80,8 +87,7 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	if err := s.matchLengths.init(br); err != nil {
 		return errors.New("matchLengths:" + err.Error())
 	}
-	s.literals = literals
-	s.hist = hist.b
+	s.br = br
 	s.prevOffset = hist.recentOffsets
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
 	s.windowSize = hist.windowSize
@@ -93,12 +99,127 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	return nil
 }
 
+// execute will execute the decoded sequence with the provided history.
+// The sequence must be evaluated before being sent.
+func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
+	if len(s.dict) == 0 {
+		return s.executeSimple(seqs, hist)
+	}
+
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with hist %d, dict %d, literals: %d into %d bytes\n", len(seqs), len(hist), len(s.dict), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Copy from dictionary...
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			if len(s.dict) == 0 {
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+			}
+
+			// we may be in dictionary.
+			dictO := len(s.dict) - (seq.mo - (t + len(hist)))
+			if dictO < 0 || dictO >= len(s.dict) {
+				return fmt.Errorf("match offset (%d) bigger than current history+dict (%d)", seq.mo, t+len(hist)+len(s.dict))
+			}
+			end := dictO + seq.ml
+			if end > len(s.dict) {
+				n := len(s.dict) - dictO
+				copy(out[t:], s.dict[dictO:])
+				t += n
+				seq.ml -= n
+			} else {
+				copy(out[t:], s.dict[dictO:end])
+				t += end - dictO
+				continue
+			}
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history from end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+		// We must be in current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+				continue
+			} else {
+				// Overlapping copy
+				// Extend destination slice and copy one byte at the time.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// Destination is the space we just added.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
+
 // decode sequences from the stream with the provided history.
-func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
+func (s *sequenceDecs) decodeSync(hist []byte) error {
+	if true {
+		supported, err := s.decodeSyncSimple(hist)
+		if supported {
+			return err
+		}
+	}
+	br := s.br
+	seqs := s.nSeqs
 	startSize := len(s.out)
 	// Grab full sizes tables, to avoid bounds checks.
 	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
 	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	out := s.out
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
 
 	for i := seqs - 1; i >= 0; i-- {
 		if br.overread() {
@@ -151,7 +272,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 
 					if temp == 0 {
 						// 0 is not valid; input is corrupted; force offset to 1
-						println("temp was 0")
+						println("WARNING: temp was 0")
 						temp = 1
 					}
 
@@ -176,40 +297,49 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		if ll > len(s.literals) {
 			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
 		}
-		size := ll + ml + len(s.out)
+		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size", size)
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
 		}
-		if size > cap(s.out) {
-			// Not enough size, will be extremely rarely triggered,
+		if size > cap(out) {
+			// Not enough size, which can happen under high volume block streaming conditions
 			// but could be if destination slice is too small for sync operations.
-			// We add maxBlockSize to the capacity.
-			s.out = append(s.out, make([]byte, maxBlockSize)...)
-			s.out = s.out[:len(s.out)-maxBlockSize]
+			// over-allocating here can create a large amount of GC pressure so we try to keep
+			// it as contained as possible
+			used := len(out) - startSize
+			addBytes := 256 + ll + ml + used>>2
+			// Clamp to max block size.
+			if used+addBytes > maxBlockSize {
+				addBytes = maxBlockSize - used
+			}
+			out = append(out, make([]byte, addBytes)...)
+			out = out[:len(out)-addBytes]
 		}
 		if ml > maxMatchLen {
 			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 
 		// Add literals
-		s.out = append(s.out, s.literals[:ll]...)
+		out = append(out, s.literals[:ll]...)
 		s.literals = s.literals[ll:]
-		out := s.out
 
-		if mo > len(s.out)+len(hist) || mo > s.windowSize {
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+
+		if mo > len(out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 
 			// we may be in dictionary.
-			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
+			dictO := len(s.dict) - (mo - (len(out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
-				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
+				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(out)+len(hist)-startSize)
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
 				out = append(out, s.dict[dictO:]...)
-				mo -= len(s.dict) - dictO
 				ml -= len(s.dict) - dictO
 			} else {
 				out = append(out, s.dict[dictO:end]...)
@@ -218,32 +348,27 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			}
 		}
 
-		if mo == 0 && ml > 0 {
-			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
-		}
-
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
-		if v := mo - len(s.out); v > 0 {
+		if v := mo - len(out); v > 0 {
 			// v is the start position in history from end.
-			start := len(s.hist) - v
+			start := len(hist) - v
 			if ml > v {
 				// Some goes into current block.
 				// Copy remainder of history
-				out = append(out, s.hist[start:]...)
-				mo -= v
+				out = append(out, hist[start:]...)
 				ml -= v
 			} else {
-				out = append(out, s.hist[start:start+ml]...)
+				out = append(out, hist[start:start+ml]...)
 				ml = 0
 			}
 		}
 		// We must be in current buffer now
 		if ml > 0 {
-			start := len(s.out) - mo
-			if ml <= len(s.out)-start {
+			start := len(out) - mo
+			if ml <= len(out)-start {
 				// No overlap
-				out = append(out, s.out[start:start+ml]...)
+				out = append(out, out[start:start+ml]...)
 			} else {
 				// Overlapping copy
 				// Extend destination slice and copy one byte at the time.
@@ -257,7 +382,6 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 				}
 			}
 		}
-		s.out = out
 		if i == 0 {
 			// This is the last sequence, so we shouldn't update state.
 			break
@@ -271,7 +395,7 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			mlState = mlTable[mlState.newState()&maxTableMask]
 			ofState = ofTable[ofState.newState()&maxTableMask]
 		} else {
-			bits := br.getBitsFast(nBits)
+			bits := br.get32BitsFast(nBits)
 			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
 			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
 
@@ -284,9 +408,14 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 		}
 	}
 
+	// Check if space for literals
+	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+	}
+
 	// Add final literals
-	s.out = append(s.out, s.literals...)
-	return nil
+	s.out = append(out, s.literals...)
+	return br.close()
 }
 
 // update states, at least 27 bits must be available.
@@ -319,7 +448,7 @@ func (s *sequenceDecs) updateAlt(br *bitReader) {
 		s.offsets.state.state = s.offsets.state.dt[c.newState()]
 		return
 	}
-	bits := br.getBitsFast(nBits)
+	bits := br.get32BitsFast(nBits)
 	lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
 	s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
 
@@ -450,36 +579,3 @@ func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int {
 	s.prevOffset[0] = temp
 	return temp
 }
-
-// mergeHistory will merge history.
-func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) {
-	for i := uint(0); i < 3; i++ {
-		var sNew, sHist *sequenceDec
-		switch i {
-		default:
-			// same as "case 0":
-			sNew = &s.litLengths
-			sHist = &hist.litLengths
-		case 1:
-			sNew = &s.offsets
-			sHist = &hist.offsets
-		case 2:
-			sNew = &s.matchLengths
-			sHist = &hist.matchLengths
-		}
-		if sNew.repeat {
-			if sHist.fse == nil {
-				return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i)
-			}
-			continue
-		}
-		if sNew.fse == nil {
-			return nil, fmt.Errorf("sequence stream %d, no fse found", i)
-		}
-		if sHist.fse != nil && !sHist.fse.preDefined {
-			fseDecoderPool.Put(sHist.fse)
-		}
-		sHist.fse = sNew.fse
-	}
-	return hist, nil
-}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
new file mode 100644
index 0000000..4676b09
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -0,0 +1,350 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+package zstd
+
+import (
+	"fmt"
+
+	"github.com/klauspost/compress/internal/cpuinfo"
+)
+
+type decodeSyncAsmContext struct {
+	llTable     []decSymbol
+	mlTable     []decSymbol
+	ofTable     []decSymbol
+	llState     uint64
+	mlState     uint64
+	ofState     uint64
+	iteration   int
+	litRemain   int
+	out         []byte
+	outPosition int
+	literals    []byte
+	litPosition int
+	history     []byte
+	windowSize  int
+	ll          int // set on error (not for all errors, please refer to _generate/gen.go)
+	ml          int // set on error (not for all errors, please refer to _generate/gen.go)
+	mo          int // set on error (not for all errors, please refer to _generate/gen.go)
+}
+
+// sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
+//go:noescape
+func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+
+// decode sequences from the stream with the provided history but without a dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	if len(s.dict) > 0 {
+		return false, nil
+	}
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSize {
+		return false, nil
+	}
+	useSafe := false
+	if s.maxSyncLen == 0 && cap(s.out)-len(s.out) < maxCompressedBlockSizeAlloc {
+		useSafe = true
+	}
+	if s.maxSyncLen > 0 && cap(s.out)-len(s.out)-compressedBlockOverAlloc < int(s.maxSyncLen) {
+		useSafe = true
+	}
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeSyncAsmContext{
+		llTable:     s.litLengths.fse.dt[:maxTablesize],
+		mlTable:     s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:     s.offsets.fse.dt[:maxTablesize],
+		llState:     uint64(s.litLengths.state.state),
+		mlState:     uint64(s.matchLengths.state.state),
+		ofState:     uint64(s.offsets.state.state),
+		iteration:   s.nSeqs - 1,
+		litRemain:   len(s.literals),
+		out:         s.out,
+		outPosition: len(s.out),
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+		history:     hist,
+	}
+
+	s.seqSize = 0
+	startSize := len(s.out)
+
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_bmi2(s, br, &ctx)
+		}
+	} else {
+		if useSafe {
+			errCode = sequenceDecs_decodeSync_safe_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decodeSync_amd64(s, br, &ctx)
+		}
+	}
+	switch errCode {
+	case noError:
+		break
+
+	case errorMatchLenOfsMismatch:
+		return true, fmt.Errorf("zero matchoff and matchlen (%d) > 0", ctx.ml)
+
+	case errorMatchLenTooBig:
+		return true, fmt.Errorf("match len (%d) bigger than max allowed length", ctx.ml)
+
+	case errorMatchOffTooBig:
+		return true, fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			ctx.mo, ctx.outPosition+len(hist)-startSize)
+
+	case errorNotEnoughLiterals:
+		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
+			ctx.ll, ctx.litRemain+ctx.ll)
+
+	case errorNotEnoughSpace:
+		size := ctx.outPosition + ctx.ll + ctx.ml
+		if debugDecoder {
+			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
+		}
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+
+	default:
+		return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+		return true, err
+	}
+
+	s.literals = s.literals[ctx.litPosition:]
+	t := ctx.outPosition
+	s.out = s.out[:t]
+
+	// Add final literals
+	s.out = append(s.out, s.literals...)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(s.out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d", len(s.out), t))
+		}
+	}
+
+	return true, nil
+}
+
+// --------------------------------------------------------------------------------
+
+type decodeAsmContext struct {
+	llTable   []decSymbol
+	mlTable   []decSymbol
+	ofTable   []decSymbol
+	llState   uint64
+	mlState   uint64
+	ofState   uint64
+	iteration int
+	seqs      []seqVals
+	litRemain int
+}
+
+const noError = 0
+
+// error reported when mo == 0 && ml > 0
+const errorMatchLenOfsMismatch = 1
+
+// error reported when ml > maxMatchLen
+const errorMatchLenTooBig = 2
+
+// error reported when mo > available history or mo > s.windowSize
+const errorMatchOffTooBig = 3
+
+// error reported when the sum of literal lengths exeeceds the literal buffer size
+const errorNotEnoughLiterals = 4
+
+// error reported when capacity of `out` is too small
+const errorNotEnoughSpace = 5
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//go:noescape
+func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+
+// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+
+	ctx := decodeAsmContext{
+		llTable:   s.litLengths.fse.dt[:maxTablesize],
+		mlTable:   s.matchLengths.fse.dt[:maxTablesize],
+		ofTable:   s.offsets.fse.dt[:maxTablesize],
+		llState:   uint64(s.litLengths.state.state),
+		mlState:   uint64(s.matchLengths.state.state),
+		ofState:   uint64(s.offsets.state.state),
+		seqs:      seqs,
+		iteration: len(seqs) - 1,
+		litRemain: len(s.literals),
+	}
+
+	s.seqSize = 0
+	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
+	var errCode int
+	if cpuinfo.HasBMI2() {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_bmi2(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_bmi2(s, br, &ctx)
+		}
+	} else {
+		if lte56bits {
+			errCode = sequenceDecs_decode_56_amd64(s, br, &ctx)
+		} else {
+			errCode = sequenceDecs_decode_amd64(s, br, &ctx)
+		}
+	}
+	if errCode != 0 {
+		i := len(seqs) - ctx.iteration - 1
+		switch errCode {
+		case errorMatchLenOfsMismatch:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+
+		case errorMatchLenTooBig:
+			ml := ctx.seqs[i].ml
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+
+		case errorNotEnoughLiterals:
+			ll := ctx.seqs[i].ll
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+		}
+
+		return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
+	}
+
+	if ctx.litRemain < 0 {
+		return fmt.Errorf("literal count is too big: total available %d, total requested %d",
+			len(s.literals), len(s.literals)-ctx.litRemain)
+	}
+
+	s.seqSize += ctx.litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// --------------------------------------------------------------------------------
+
+type executeAsmContext struct {
+	seqs        []seqVals
+	seqIndex    int
+	out         []byte
+	history     []byte
+	literals    []byte
+	outPosition int
+	litPosition int
+	windowSize  int
+}
+
+// sequenceDecs_executeSimple_amd64 implements the main loop of sequenceDecs.executeSimple in x86 asm.
+//
+// Returns false if a match offset is too big.
+//
+// Please refer to seqdec_generic.go for the reference implementation.
+//go:noescape
+func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+
+// executeSimple handles cases when dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize+compressedBlockOverAlloc > cap(s.out) {
+		addBytes := s.seqSize + len(s.out) + compressedBlockOverAlloc
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	ctx := executeAsmContext{
+		seqs:        seqs,
+		seqIndex:    0,
+		out:         out,
+		history:     hist,
+		outPosition: t,
+		litPosition: 0,
+		literals:    s.literals,
+		windowSize:  s.windowSize,
+	}
+
+	ok := sequenceDecs_executeSimple_amd64(&ctx)
+	if !ok {
+		return fmt.Errorf("match offset (%d) bigger than current history (%d)",
+			seqs[ctx.seqIndex].mo, ctx.outPosition+len(hist))
+	}
+	s.literals = s.literals[ctx.litPosition:]
+	t = ctx.outPosition
+
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
new file mode 100644
index 0000000..2585b2e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -0,0 +1,3517 @@
+// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
+
+//go:build !appengine && !noasm && gc && !noasm
+// +build !appengine,!noasm,gc,!noasm
+
+// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_end
+
+sequenceDecs_decode_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 16(R10)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 8(R10)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_amd64_fill_2_end
+
+sequenceDecs_decode_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decode_amd64_fill_2_end:
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, DI
+
+sequenceDecs_decode_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R8
+
+sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R9
+
+sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_amd64_skip_update:
+	// Adjust offset
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_amd64_adjust_end
+
+sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_amd64_adjust_end
+
+sequenceDecs_decode_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_amd64_adjust_three
+	JMP  sequenceDecs_decode_amd64_adjust_two
+
+sequenceDecs_decode_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_amd64_adjust_end:
+	MOVQ CX, 16(R10)
+
+	// Check values
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: CMOV
+TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    104(AX), R10
+	MOVQ    s+0(FP), AX
+	MOVQ    144(AX), R11
+	MOVQ    152(AX), R12
+	MOVQ    160(AX), R13
+
+sequenceDecs_decode_56_amd64_main_loop:
+	MOVQ (SP), R14
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decode_56_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R14
+	MOVQ (R14), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decode_56_amd64_fill_end
+
+sequenceDecs_decode_56_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decode_56_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decode_56_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R14
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R14), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
+
+sequenceDecs_decode_56_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 16(R10)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, 8(R10)
+
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R15
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R15
+	ADDQ    R15, AX
+	MOVQ    AX, (R10)
+
+	// Fill bitreader for state updates
+	MOVQ    R14, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decode_56_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R14
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, DI
+
+sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R14
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R8
+
+sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R14
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R14, $0x00
+	JZ      sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R14, BX
+	MOVQ    DX, R15
+	SHLQ    CL, R15
+	MOVQ    R14, CX
+	NEGQ    CX
+	SHRQ    CL, R15
+	ADDQ    R15, R9
+
+sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decode_56_amd64_skip_update:
+	// Adjust offset
+	MOVQ 16(R10), CX
+	CMPQ AX, $0x01
+	JBE  sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
+	MOVQ R12, R13
+	MOVQ R11, R12
+	MOVQ CX, R11
+	JMP  sequenceDecs_decode_56_amd64_adjust_end
+
+sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
+	CMPQ (R10), $0x00000000
+	JNE  sequenceDecs_decode_56_amd64_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+
+sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_offset_nonzero
+	MOVQ  R11, CX
+	JMP   sequenceDecs_decode_56_amd64_adjust_end
+
+sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_56_amd64_adjust_zero
+	JEQ  sequenceDecs_decode_56_amd64_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_amd64_adjust_three
+	JMP  sequenceDecs_decode_56_amd64_adjust_two
+
+sequenceDecs_decode_56_amd64_adjust_zero:
+	MOVQ R11, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_one:
+	MOVQ R12, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_two:
+	MOVQ R13, AX
+	JMP  sequenceDecs_decode_56_amd64_adjust_test_temp_valid
+
+sequenceDecs_decode_56_amd64_adjust_three:
+	LEAQ -1(R11), AX
+
+sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_adjust_temp_valid
+	MOVQ  $0x00000001, AX
+
+sequenceDecs_decode_56_amd64_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R12, R13
+	MOVQ    R11, R12
+	MOVQ    AX, R11
+	MOVQ    AX, CX
+
+sequenceDecs_decode_56_amd64_adjust_end:
+	MOVQ CX, 16(R10)
+
+	// Check values
+	MOVQ  8(R10), AX
+	MOVQ  (R10), R14
+	LEAQ  (AX)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decode_56_amd64_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_amd64_match_len_ofs_ok:
+	ADDQ $0x18, R10
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decode_56_amd64_main_loop
+	MOVQ s+0(FP), AX
+	MOVQ R11, 144(AX)
+	MOVQ R12, 152(AX)
+	MOVQ R13, 160(AX)
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_amd64_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    104(CX), R9
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_bmi2_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_end
+
+sequenceDecs_decode_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_bmi2_fill_2_end
+
+sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decode_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9)
+
+	// Fill bitreader for state updates
+	MOVQ   R13, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decode_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decode_bmi2_skip_update:
+	// Adjust offset
+	MOVQ 16(R9), CX
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
+	MOVQ R11, R12
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_bmi2_adjust_end
+
+sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000
+	JNE  sequenceDecs_decode_bmi2_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX
+	JMP   sequenceDecs_decode_bmi2_adjust_end
+
+sequenceDecs_decode_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_bmi2_adjust_three
+	JMP  sequenceDecs_decode_bmi2_adjust_two
+
+sequenceDecs_decode_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_bmi2_adjust_three:
+	LEAQ -1(R10), R13
+
+sequenceDecs_decode_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13
+
+sequenceDecs_decode_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12
+	MOVQ    R10, R11
+	MOVQ    R13, R10
+	MOVQ    R13, CX
+
+sequenceDecs_decode_bmi2_adjust_end:
+	MOVQ CX, 16(R9)
+
+	// Check values
+	MOVQ  8(R9), R13
+	MOVQ  (R9), R14
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_bmi2_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_bmi2_main_loop
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX)
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
+// Requires: BMI, BMI2, CMOV
+TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    104(CX), R9
+	MOVQ    s+0(FP), CX
+	MOVQ    144(CX), R10
+	MOVQ    152(CX), R11
+	MOVQ    160(CX), R12
+
+sequenceDecs_decode_56_bmi2_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R13
+	MOVQ (R13), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decode_56_bmi2_fill_end
+
+sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R13), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
+
+sequenceDecs_decode_56_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 16(R9)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, 8(R9)
+
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R14
+	MOVQ   AX, R15
+	LEAQ   (DX)(R14*1), CX
+	ROLQ   CL, R15
+	BZHIQ  R14, R15, R15
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R15, CX
+	MOVQ   CX, (R9)
+
+	// Fill bitreader for state updates
+	MOVQ   R13, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decode_56_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R14
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R14*1), CX
+	MOVQ    AX, R15
+	MOVQ    CX, DX
+	ROLQ    CL, R15
+	BZHIQ   R14, R15, R15
+	ADDQ    R15, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decode_56_bmi2_skip_update:
+	// Adjust offset
+	MOVQ 16(R9), CX
+	CMPQ R13, $0x01
+	JBE  sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
+	MOVQ R11, R12
+	MOVQ R10, R11
+	MOVQ CX, R10
+	JMP  sequenceDecs_decode_56_bmi2_adjust_end
+
+sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
+	CMPQ (R9), $0x00000000
+	JNE  sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
+	INCQ CX
+	JMP  sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
+	MOVQ  R10, CX
+	JMP   sequenceDecs_decode_56_bmi2_adjust_end
+
+sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
+	CMPQ CX, $0x01
+	JB   sequenceDecs_decode_56_bmi2_adjust_zero
+	JEQ  sequenceDecs_decode_56_bmi2_adjust_one
+	CMPQ CX, $0x02
+	JA   sequenceDecs_decode_56_bmi2_adjust_three
+	JMP  sequenceDecs_decode_56_bmi2_adjust_two
+
+sequenceDecs_decode_56_bmi2_adjust_zero:
+	MOVQ R10, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_one:
+	MOVQ R11, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_two:
+	MOVQ R12, R13
+	JMP  sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
+
+sequenceDecs_decode_56_bmi2_adjust_three:
+	LEAQ -1(R10), R13
+
+sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_56_bmi2_adjust_temp_valid
+	MOVQ  $0x00000001, R13
+
+sequenceDecs_decode_56_bmi2_adjust_temp_valid:
+	CMPQ    CX, $0x01
+	CMOVQNE R11, R12
+	MOVQ    R10, R11
+	MOVQ    R13, R10
+	MOVQ    R13, CX
+
+sequenceDecs_decode_56_bmi2_adjust_end:
+	MOVQ CX, 16(R9)
+
+	// Check values
+	MOVQ  8(R9), R13
+	MOVQ  (R9), R14
+	LEAQ  (R13)(R14*1), R15
+	MOVQ  s+0(FP), BP
+	ADDQ  R15, 256(BP)
+	MOVQ  ctx+16(FP), R15
+	SUBQ  R14, 128(R15)
+	JS    error_not_enough_literals
+	CMPQ  R13, $0x00020002
+	JA    sequenceDecs_decode_56_bmi2_error_match_len_too_big
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decode_56_bmi2_match_len_ofs_ok
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
+	ADDQ $0x18, R9
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decode_56_bmi2_main_loop
+	MOVQ s+0(FP), CX
+	MOVQ R10, 144(CX)
+	MOVQ R11, 152(CX)
+	MOVQ R12, 160(CX)
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decode_56_bmi2_error_match_len_too_big:
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
+// Requires: SSE
+TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
+	MOVQ  ctx+0(FP), R10
+	MOVQ  8(R10), CX
+	TESTQ CX, CX
+	JZ    empty_seqs
+	MOVQ  (R10), AX
+	MOVQ  24(R10), DX
+	MOVQ  32(R10), BX
+	MOVQ  80(R10), SI
+	MOVQ  104(R10), DI
+	MOVQ  120(R10), R8
+	MOVQ  56(R10), R9
+	MOVQ  64(R10), R10
+	ADDQ  R10, R9
+
+	// seqsBase += 24 * seqIndex
+	LEAQ (DX)(DX*2), R11
+	SHLQ $0x03, R11
+	ADDQ R11, AX
+
+	// outBase += outPosition
+	ADDQ DI, BX
+
+main_loop:
+	MOVQ (AX), R11
+	MOVQ 16(AX), R12
+	MOVQ 8(AX), R13
+
+	// Copy literals
+	TESTQ R11, R11
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, R11
+	JZ    copy_1_word
+	MOVB  (SI)(R14*1), R15
+	MOVB  R15, (BX)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, R11
+	JZ    copy_1_dword
+	MOVW  (SI)(R14*1), R15
+	MOVW  R15, (BX)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, R11
+	JZ    copy_1_qword
+	MOVL  (SI)(R14*1), R15
+	MOVL  R15, (BX)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, R11
+	JZ    copy_1_test
+	MOVQ  (SI)(R14*1), R15
+	MOVQ  R15, (BX)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (SI)(R14*1), X0
+	MOVUPS X0, (BX)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, R11
+	JB   copy_1
+	ADDQ R11, SI
+	ADDQ R11, BX
+	ADDQ R11, DI
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	LEAQ (DI)(R10*1), R11
+	CMPQ R12, R11
+	JG   error_match_off_too_big
+	CMPQ R12, R8
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  R12, R11
+	SUBQ  DI, R11
+	JLS   copy_match
+	MOVQ  R9, R14
+	SUBQ  R11, R14
+	CMPQ  R13, R11
+	JGE   copy_all_from_history
+	XORQ  R11, R11
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(R11*1), R12
+	MOVB  R12, (BX)(R11*1)
+	ADDQ  $0x01, R11
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(R11*1), R12
+	MOVW  R12, (BX)(R11*1)
+	ADDQ  $0x02, R11
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(R11*1), R12
+	MOVL  R12, (BX)(R11*1)
+	ADDQ  $0x04, R11
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(R11*1), R12
+	MOVQ  R12, (BX)(R11*1)
+	ADDQ  $0x08, R11
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(R11*1), X0
+	MOVUPS X0, (BX)(R11*1)
+	ADDQ   $0x10, R11
+
+copy_4_test:
+	CMPQ R11, R13
+	JB   copy_4
+	ADDQ R13, DI
+	ADDQ R13, BX
+	ADDQ $0x18, AX
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+	JMP  loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, R11
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (BX)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, R11
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (BX)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, R11
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (BX)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, R11
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (BX)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (BX)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, R11
+	JB   copy_5
+	ADDQ R11, BX
+	ADDQ R11, DI
+	SUBQ R11, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  BX, R11
+	SUBQ  R12, R11
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, DI
+	MOVQ BX, R12
+	ADDQ R13, BX
+
+copy_2:
+	MOVUPS (R11), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, R11
+	ADDQ   $0x10, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, DI
+
+copy_slow_3:
+	MOVB (R11), R12
+	MOVB R12, (BX)
+	INCQ R11
+	INCQ BX
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	ADDQ $0x18, AX
+	INCQ DX
+	CMPQ DX, CX
+	JB   main_loop
+
+loop_finished:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+error_match_off_too_big:
+	// Return value
+	MOVB $0x00, ret+8(FP)
+
+	// Update the context
+	MOVQ ctx+0(FP), AX
+	MOVQ DX, 24(AX)
+	MOVQ DI, 104(AX)
+	MOVQ 80(AX), CX
+	SUBQ CX, SI
+	MOVQ SI, 112(AX)
+	RET
+
+empty_seqs:
+	// Return value
+	MOVB $0x01, ret+8(FP)
+	RET
+
+// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    112(AX), R10
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP)
+	MOVQ    144(AX), R11
+	MOVQ    136(AX), R12
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP)
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP)
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP)
+
+	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_amd64_fill_end
+
+sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 8(SP)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_amd64_fill_2_end
+
+sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_amd64_fill_2_end:
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, DI
+
+sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R8
+
+sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R9
+
+sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_amd64_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_amd64_adjust_end
+
+sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(AX*8), R14
+	JNZ     sequenceDecs_decodeSync_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_amd64_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), AX
+	MOVQ  24(SP), CX
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, AX
+	JZ    copy_1_word
+	MOVB  (R11)(R14*1), R15
+	MOVB  R15, (R10)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_1_dword
+	MOVW  (R11)(R14*1), R15
+	MOVW  R15, (R10)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_1_qword
+	MOVL  (R11)(R14*1), R15
+	MOVL  R15, (R10)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_1_test
+	MOVQ  (R11)(R14*1), R15
+	MOVQ  R15, (R10)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB   copy_1
+	ADDQ AX, R11
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  CX, AX
+	SUBQ  R12, AX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  AX, R14
+	CMPQ  R13, AX
+	JGE   copy_all_from_history
+	XORQ  AX, AX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(AX*1), CL
+	MOVB  CL, (R10)(AX*1)
+	ADDQ  $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(AX*1), CX
+	MOVW  CX, (R10)(AX*1)
+	ADDQ  $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(AX*1), CX
+	MOVL  CX, (R10)(AX*1)
+	ADDQ  $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(AX*1), CX
+	MOVQ  CX, (R10)(AX*1)
+	ADDQ  $0x08, AX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ   $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB   copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP  handle_loop
+	JMP loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, AX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R10)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R10)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R10)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R10)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB   copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R10, AX
+	SUBQ  CX, AX
+
+	// ml <= mo
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, R12
+	MOVQ R10, CX
+	ADDQ R13, R10
+
+copy_2:
+	MOVUPS (AX), X0
+	MOVUPS X0, (CX)
+	ADDQ   $0x10, AX
+	ADDQ   $0x10, CX
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R12
+
+copy_slow_3:
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_end
+
+sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ   R12, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R12
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decodeSync_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_bmi2_adjust_end
+
+sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_bmi2_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, CX
+	JZ    copy_1_word
+	MOVB  (R10)(R14*1), R15
+	MOVB  R15, (R9)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_1_dword
+	MOVW  (R10)(R14*1), R15
+	MOVW  R15, (R9)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_1_qword
+	MOVL  (R10)(R14*1), R15
+	MOVL  R15, (R9)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_1_test
+	MOVQ  (R10)(R14*1), R15
+	MOVQ  R15, (R9)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB   copy_1
+	ADDQ CX, R10
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  R12, CX
+	SUBQ  R11, CX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  CX, R14
+	CMPQ  R13, CX
+	JGE   copy_all_from_history
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(CX*1), R12
+	MOVB  R12, (R9)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(CX*1), R12
+	MOVW  R12, (R9)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(CX*1), R12
+	MOVL  R12, (R9)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(CX*1), R12
+	MOVQ  R12, (R9)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB   copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP  handle_loop
+	JMP loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, CX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R9)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R9)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R9)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R9)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB   copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R9, CX
+	SUBQ  R12, CX
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ R13, R11
+	MOVQ R9, R12
+	ADDQ R13, R9
+
+copy_2:
+	MOVUPS (CX), X0
+	MOVUPS X0, (R12)
+	ADDQ   $0x10, CX
+	ADDQ   $0x10, R12
+	SUBQ   $0x10, R13
+	JHI    copy_2
+	JMP    handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R11
+
+copy_slow_3:
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
+	MOVQ    br+8(FP), AX
+	MOVQ    32(AX), DX
+	MOVBQZX 40(AX), BX
+	MOVQ    24(AX), SI
+	MOVQ    (AX), AX
+	ADDQ    SI, AX
+	MOVQ    AX, (SP)
+	MOVQ    ctx+16(FP), AX
+	MOVQ    72(AX), DI
+	MOVQ    80(AX), R8
+	MOVQ    88(AX), R9
+	MOVQ    112(AX), R10
+	MOVQ    128(AX), CX
+	MOVQ    CX, 32(SP)
+	MOVQ    144(AX), R11
+	MOVQ    136(AX), R12
+	MOVQ    200(AX), CX
+	MOVQ    CX, 56(SP)
+	MOVQ    176(AX), CX
+	MOVQ    CX, 48(SP)
+	MOVQ    184(AX), AX
+	MOVQ    AX, 40(SP)
+	MOVQ    40(SP), AX
+	ADDQ    AX, 48(SP)
+
+	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R10, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R12, R10
+
+sequenceDecs_decodeSync_safe_amd64_main_loop:
+	MOVQ (SP), R13
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_end:
+	// Update offset
+	MOVQ    R9, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 8(SP)
+
+	// Update match length
+	MOVQ    R8, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ SI, $0x08
+	JL   sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+	MOVQ BX, AX
+	SHRQ $0x03, AX
+	SUBQ AX, R13
+	MOVQ (R13), DX
+	SUBQ AX, SI
+	ANDQ $0x07, BX
+	JMP  sequenceDecs_decodeSync_safe_amd64_fill_2_end
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
+	CMPQ    SI, $0x00
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	CMPQ    BX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	SHLQ    $0x08, DX
+	SUBQ    $0x01, R13
+	SUBQ    $0x01, SI
+	SUBQ    $0x08, BX
+	MOVBQZX (R13), AX
+	ORQ     AX, DX
+	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_amd64_fill_2_end:
+	// Update literal length
+	MOVQ    DI, AX
+	MOVQ    BX, CX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVB    AH, CL
+	ADDQ    CX, BX
+	NEGL    CX
+	SHRQ    CL, R14
+	SHRQ    $0x20, AX
+	TESTQ   CX, CX
+	CMOVQEQ CX, R14
+	ADDQ    R14, AX
+	MOVQ    AX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ    R13, (SP)
+	MOVQ    R9, AX
+	SHRQ    $0x08, AX
+	MOVBQZX AL, AX
+	MOVQ    ctx+16(FP), CX
+	CMPQ    96(CX), $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_skip_update
+
+	// Update Literal Length State
+	MOVBQZX DI, R13
+	SHRQ    $0x10, DI
+	MOVWQZX DI, DI
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, DI
+
+sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Match Length State
+	MOVBQZX R8, R13
+	SHRQ    $0x10, R8
+	MOVWQZX R8, R8
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R8
+
+sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+	// Update Offset State
+	MOVBQZX R9, R13
+	SHRQ    $0x10, R9
+	MOVWQZX R9, R9
+	CMPQ    R13, $0x00
+	JZ      sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
+	MOVQ    BX, CX
+	ADDQ    R13, BX
+	MOVQ    DX, R14
+	SHLQ    CL, R14
+	MOVQ    R13, CX
+	NEGQ    CX
+	SHRQ    CL, R14
+	ADDQ    R14, R9
+
+sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R9*8), R9
+
+sequenceDecs_decodeSync_safe_amd64_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   AX, $0x01
+	JBE    sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_amd64_adjust_end
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_amd64_adjust_end
+
+sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
+	MOVQ    R13, AX
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, AX
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(AX*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_amd64_adjust_skip
+	MOVQ 152(CX), AX
+	MOVQ AX, 160(CX)
+
+sequenceDecs_decodeSync_safe_amd64_adjust_skip:
+	MOVQ 144(CX), AX
+	MOVQ AX, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_amd64_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), AX
+	MOVQ  24(SP), CX
+	LEAQ  (AX)(CX*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  CX, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  AX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
+	TESTQ AX, AX
+	JNZ   sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
+	MOVQ 24(SP), AX
+	MOVQ 8(SP), CX
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (AX)(R13*1), R14
+	ADDQ R10, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ AX, AX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, AX
+	JZ    copy_1_word
+	MOVB  (R11)(R14*1), R15
+	MOVB  R15, (R10)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_1_dword
+	MOVW  (R11)(R14*1), R15
+	MOVW  R15, (R10)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_1_qword
+	MOVL  (R11)(R14*1), R15
+	MOVL  R15, (R10)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_1_test
+	MOVQ  (R11)(R14*1), R15
+	MOVQ  R15, (R10)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R11)(R14*1), X0
+	MOVUPS X0, (R10)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, AX
+	JB   copy_1
+	ADDQ AX, R11
+	ADDQ AX, R10
+	ADDQ AX, R12
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R12, AX
+	ADDQ 40(SP), AX
+	CMPQ CX, AX
+	JG   error_match_off_too_big
+	CMPQ CX, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  CX, AX
+	SUBQ  R12, AX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  AX, R14
+	CMPQ  R13, AX
+	JGE   copy_all_from_history
+	XORQ  AX, AX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(AX*1), CL
+	MOVB  CL, (R10)(AX*1)
+	ADDQ  $0x01, AX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(AX*1), CX
+	MOVW  CX, (R10)(AX*1)
+	ADDQ  $0x02, AX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(AX*1), CX
+	MOVL  CX, (R10)(AX*1)
+	ADDQ  $0x04, AX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(AX*1), CX
+	MOVQ  CX, (R10)(AX*1)
+	ADDQ  $0x08, AX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(AX*1), X0
+	MOVUPS X0, (R10)(AX*1)
+	ADDQ   $0x10, AX
+
+copy_4_test:
+	CMPQ AX, R13
+	JB   copy_4
+	ADDQ R13, R12
+	ADDQ R13, R10
+	JMP  handle_loop
+	JMP loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, AX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R10)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, AX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R10)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, AX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R10)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, AX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R10)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R10)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, AX
+	JB   copy_5
+	ADDQ AX, R10
+	ADDQ AX, R12
+	SUBQ AX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R10, AX
+	SUBQ  CX, AX
+
+	// ml <= mo
+	CMPQ R13, CX
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ  R13, R12
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_2_word
+	MOVB  (AX)(CX*1), R14
+	MOVB  R14, (R10)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_2_dword
+	MOVW  (AX)(CX*1), R14
+	MOVW  R14, (R10)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_2_qword
+	MOVL  (AX)(CX*1), R14
+	MOVL  R14, (R10)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_2_test
+	MOVQ  (AX)(CX*1), R14
+	MOVQ  R14, (R10)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_2_test
+
+copy_2:
+	MOVUPS (AX)(CX*1), X0
+	MOVUPS X0, (R10)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_2_test:
+	CMPQ CX, R13
+	JB   copy_2
+	ADDQ R13, R10
+	JMP  handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R12
+
+copy_slow_3:
+	MOVB (AX), CL
+	MOVB CL, (R10)
+	INCQ AX
+	INCQ R10
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), AX
+	DECQ 96(AX)
+	JNS  sequenceDecs_decodeSync_safe_amd64_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), AX
+	MOVQ DX, 32(AX)
+	MOVB BL, 40(AX)
+	MOVQ SI, 24(AX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R12, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R11
+	MOVQ R11, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R12, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
+
+// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
+// Requires: BMI, BMI2, CMOV, SSE
+TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
+	MOVQ    br+8(FP), CX
+	MOVQ    32(CX), AX
+	MOVBQZX 40(CX), DX
+	MOVQ    24(CX), BX
+	MOVQ    (CX), CX
+	ADDQ    BX, CX
+	MOVQ    CX, (SP)
+	MOVQ    ctx+16(FP), CX
+	MOVQ    72(CX), SI
+	MOVQ    80(CX), DI
+	MOVQ    88(CX), R8
+	MOVQ    112(CX), R9
+	MOVQ    128(CX), R10
+	MOVQ    R10, 32(SP)
+	MOVQ    144(CX), R10
+	MOVQ    136(CX), R11
+	MOVQ    200(CX), R12
+	MOVQ    R12, 56(SP)
+	MOVQ    176(CX), R12
+	MOVQ    R12, 48(SP)
+	MOVQ    184(CX), CX
+	MOVQ    CX, 40(SP)
+	MOVQ    40(SP), CX
+	ADDQ    CX, 48(SP)
+
+	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	ADDQ R9, 32(SP)
+
+	// outBase += outPosition
+	ADDQ R11, R9
+
+sequenceDecs_decodeSync_safe_bmi2_main_loop:
+	MOVQ (SP), R12
+
+	// Fill bitreader to have enough for the offset and match length.
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_end:
+	// Update offset
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   R8, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 8(SP)
+
+	// Update match length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, DI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   DI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 16(SP)
+
+	// Fill bitreader to have enough for the remaining
+	CMPQ BX, $0x08
+	JL   sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+	MOVQ DX, CX
+	SHRQ $0x03, CX
+	SUBQ CX, R12
+	MOVQ (R12), AX
+	SUBQ CX, BX
+	ANDQ $0x07, DX
+	JMP  sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
+	CMPQ    BX, $0x00
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	CMPQ    DX, $0x07
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	SHLQ    $0x08, AX
+	SUBQ    $0x01, R12
+	SUBQ    $0x01, BX
+	SUBQ    $0x08, DX
+	MOVBQZX (R12), CX
+	ORQ     CX, AX
+	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
+
+sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
+	// Update literal length
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, SI, R13
+	MOVQ   AX, R14
+	LEAQ   (DX)(R13*1), CX
+	ROLQ   CL, R14
+	BZHIQ  R13, R14, R14
+	MOVQ   CX, DX
+	MOVQ   SI, CX
+	SHRQ   $0x20, CX
+	ADDQ   R14, CX
+	MOVQ   CX, 24(SP)
+
+	// Fill bitreader for state updates
+	MOVQ   R12, (SP)
+	MOVQ   $0x00000808, CX
+	BEXTRQ CX, R8, R12
+	MOVQ   ctx+16(FP), CX
+	CMPQ   96(CX), $0x00
+	JZ     sequenceDecs_decodeSync_safe_bmi2_skip_update
+
+	// Update Literal Length State
+	MOVBQZX SI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, SI, SI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, SI
+
+	// Load ctx.llTable
+	MOVQ ctx+16(FP), CX
+	MOVQ (CX), CX
+	MOVQ (CX)(SI*8), SI
+
+	// Update Match Length State
+	MOVBQZX DI, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, DI, DI
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, DI
+
+	// Load ctx.mlTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 24(CX), CX
+	MOVQ (CX)(DI*8), DI
+
+	// Update Offset State
+	MOVBQZX R8, R13
+	MOVQ    $0x00001010, CX
+	BEXTRQ  CX, R8, R8
+	LEAQ    (DX)(R13*1), CX
+	MOVQ    AX, R14
+	MOVQ    CX, DX
+	ROLQ    CL, R14
+	BZHIQ   R13, R14, R14
+	ADDQ    R14, R8
+
+	// Load ctx.ofTable
+	MOVQ ctx+16(FP), CX
+	MOVQ 48(CX), CX
+	MOVQ (CX)(R8*8), R8
+
+sequenceDecs_decodeSync_safe_bmi2_skip_update:
+	// Adjust offset
+	MOVQ   s+0(FP), CX
+	MOVQ   8(SP), R13
+	CMPQ   R12, $0x01
+	JBE    sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
+	MOVUPS 144(CX), X0
+	MOVQ   R13, 144(CX)
+	MOVUPS X0, 152(CX)
+	JMP    sequenceDecs_decodeSync_safe_bmi2_adjust_end
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
+	CMPQ 24(SP), $0x00000000
+	JNE  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
+	INCQ R13
+	JMP  sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
+	MOVQ  144(CX), R13
+	JMP   sequenceDecs_decodeSync_safe_bmi2_adjust_end
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
+	MOVQ    R13, R12
+	XORQ    R14, R14
+	MOVQ    $-1, R15
+	CMPQ    R13, $0x03
+	CMOVQEQ R14, R12
+	CMOVQEQ R15, R14
+	LEAQ    144(CX), R15
+	ADDQ    (R15)(R12*8), R14
+	JNZ     sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
+	MOVQ    $0x00000001, R14
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
+	CMPQ R13, $0x01
+	JZ   sequenceDecs_decodeSync_safe_bmi2_adjust_skip
+	MOVQ 152(CX), R12
+	MOVQ R12, 160(CX)
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
+	MOVQ 144(CX), R12
+	MOVQ R12, 152(CX)
+	MOVQ R14, 144(CX)
+	MOVQ R14, R13
+
+sequenceDecs_decodeSync_safe_bmi2_adjust_end:
+	MOVQ R13, 8(SP)
+
+	// Check values
+	MOVQ  16(SP), CX
+	MOVQ  24(SP), R12
+	LEAQ  (CX)(R12*1), R14
+	MOVQ  s+0(FP), R15
+	ADDQ  R14, 256(R15)
+	MOVQ  ctx+16(FP), R14
+	SUBQ  R12, 104(R14)
+	JS    error_not_enough_literals
+	CMPQ  CX, $0x00020002
+	JA    sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
+	TESTQ R13, R13
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
+	TESTQ CX, CX
+	JNZ   sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
+
+sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
+	MOVQ 24(SP), CX
+	MOVQ 8(SP), R12
+	MOVQ 16(SP), R13
+
+	// Check if we have enough space in s.out
+	LEAQ (CX)(R13*1), R14
+	ADDQ R9, R14
+	CMPQ R14, 32(SP)
+	JA   error_not_enough_space
+
+	// Copy literals
+	TESTQ CX, CX
+	JZ    check_offset
+	XORQ  R14, R14
+	TESTQ $0x00000001, CX
+	JZ    copy_1_word
+	MOVB  (R10)(R14*1), R15
+	MOVB  R15, (R9)(R14*1)
+	ADDQ  $0x01, R14
+
+copy_1_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_1_dword
+	MOVW  (R10)(R14*1), R15
+	MOVW  R15, (R9)(R14*1)
+	ADDQ  $0x02, R14
+
+copy_1_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_1_qword
+	MOVL  (R10)(R14*1), R15
+	MOVL  R15, (R9)(R14*1)
+	ADDQ  $0x04, R14
+
+copy_1_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_1_test
+	MOVQ  (R10)(R14*1), R15
+	MOVQ  R15, (R9)(R14*1)
+	ADDQ  $0x08, R14
+	JMP   copy_1_test
+
+copy_1:
+	MOVUPS (R10)(R14*1), X0
+	MOVUPS X0, (R9)(R14*1)
+	ADDQ   $0x10, R14
+
+copy_1_test:
+	CMPQ R14, CX
+	JB   copy_1
+	ADDQ CX, R10
+	ADDQ CX, R9
+	ADDQ CX, R11
+
+	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
+check_offset:
+	MOVQ R11, CX
+	ADDQ 40(SP), CX
+	CMPQ R12, CX
+	JG   error_match_off_too_big
+	CMPQ R12, 56(SP)
+	JG   error_match_off_too_big
+
+	// Copy match from history
+	MOVQ  R12, CX
+	SUBQ  R11, CX
+	JLS   copy_match
+	MOVQ  48(SP), R14
+	SUBQ  CX, R14
+	CMPQ  R13, CX
+	JGE   copy_all_from_history
+	XORQ  CX, CX
+	TESTQ $0x00000001, R13
+	JZ    copy_4_word
+	MOVB  (R14)(CX*1), R12
+	MOVB  R12, (R9)(CX*1)
+	ADDQ  $0x01, CX
+
+copy_4_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_4_dword
+	MOVW  (R14)(CX*1), R12
+	MOVW  R12, (R9)(CX*1)
+	ADDQ  $0x02, CX
+
+copy_4_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_4_qword
+	MOVL  (R14)(CX*1), R12
+	MOVL  R12, (R9)(CX*1)
+	ADDQ  $0x04, CX
+
+copy_4_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_4_test
+	MOVQ  (R14)(CX*1), R12
+	MOVQ  R12, (R9)(CX*1)
+	ADDQ  $0x08, CX
+	JMP   copy_4_test
+
+copy_4:
+	MOVUPS (R14)(CX*1), X0
+	MOVUPS X0, (R9)(CX*1)
+	ADDQ   $0x10, CX
+
+copy_4_test:
+	CMPQ CX, R13
+	JB   copy_4
+	ADDQ R13, R11
+	ADDQ R13, R9
+	JMP  handle_loop
+	JMP loop_finished
+
+copy_all_from_history:
+	XORQ  R15, R15
+	TESTQ $0x00000001, CX
+	JZ    copy_5_word
+	MOVB  (R14)(R15*1), BP
+	MOVB  BP, (R9)(R15*1)
+	ADDQ  $0x01, R15
+
+copy_5_word:
+	TESTQ $0x00000002, CX
+	JZ    copy_5_dword
+	MOVW  (R14)(R15*1), BP
+	MOVW  BP, (R9)(R15*1)
+	ADDQ  $0x02, R15
+
+copy_5_dword:
+	TESTQ $0x00000004, CX
+	JZ    copy_5_qword
+	MOVL  (R14)(R15*1), BP
+	MOVL  BP, (R9)(R15*1)
+	ADDQ  $0x04, R15
+
+copy_5_qword:
+	TESTQ $0x00000008, CX
+	JZ    copy_5_test
+	MOVQ  (R14)(R15*1), BP
+	MOVQ  BP, (R9)(R15*1)
+	ADDQ  $0x08, R15
+	JMP   copy_5_test
+
+copy_5:
+	MOVUPS (R14)(R15*1), X0
+	MOVUPS X0, (R9)(R15*1)
+	ADDQ   $0x10, R15
+
+copy_5_test:
+	CMPQ R15, CX
+	JB   copy_5
+	ADDQ CX, R9
+	ADDQ CX, R11
+	SUBQ CX, R13
+
+	// Copy match from the current buffer
+copy_match:
+	TESTQ R13, R13
+	JZ    handle_loop
+	MOVQ  R9, CX
+	SUBQ  R12, CX
+
+	// ml <= mo
+	CMPQ R13, R12
+	JA   copy_overlapping_match
+
+	// Copy non-overlapping match
+	ADDQ  R13, R11
+	XORQ  R12, R12
+	TESTQ $0x00000001, R13
+	JZ    copy_2_word
+	MOVB  (CX)(R12*1), R14
+	MOVB  R14, (R9)(R12*1)
+	ADDQ  $0x01, R12
+
+copy_2_word:
+	TESTQ $0x00000002, R13
+	JZ    copy_2_dword
+	MOVW  (CX)(R12*1), R14
+	MOVW  R14, (R9)(R12*1)
+	ADDQ  $0x02, R12
+
+copy_2_dword:
+	TESTQ $0x00000004, R13
+	JZ    copy_2_qword
+	MOVL  (CX)(R12*1), R14
+	MOVL  R14, (R9)(R12*1)
+	ADDQ  $0x04, R12
+
+copy_2_qword:
+	TESTQ $0x00000008, R13
+	JZ    copy_2_test
+	MOVQ  (CX)(R12*1), R14
+	MOVQ  R14, (R9)(R12*1)
+	ADDQ  $0x08, R12
+	JMP   copy_2_test
+
+copy_2:
+	MOVUPS (CX)(R12*1), X0
+	MOVUPS X0, (R9)(R12*1)
+	ADDQ   $0x10, R12
+
+copy_2_test:
+	CMPQ R12, R13
+	JB   copy_2
+	ADDQ R13, R9
+	JMP  handle_loop
+
+	// Copy overlapping match
+copy_overlapping_match:
+	ADDQ R13, R11
+
+copy_slow_3:
+	MOVB (CX), R12
+	MOVB R12, (R9)
+	INCQ CX
+	INCQ R9
+	DECQ R13
+	JNZ  copy_slow_3
+
+handle_loop:
+	MOVQ ctx+16(FP), CX
+	DECQ 96(CX)
+	JNS  sequenceDecs_decodeSync_safe_bmi2_main_loop
+
+loop_finished:
+	MOVQ br+8(FP), CX
+	MOVQ AX, 32(CX)
+	MOVB DL, 40(CX)
+	MOVQ BX, 24(CX)
+
+	// Update the context
+	MOVQ ctx+16(FP), AX
+	MOVQ R11, 136(AX)
+	MOVQ 144(AX), CX
+	SUBQ CX, R10
+	MOVQ R10, 168(AX)
+
+	// Return success
+	MOVQ $0x00000000, ret+24(FP)
+	RET
+
+	// Return with match length error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
+	MOVQ 16(SP), AX
+	MOVQ ctx+16(FP), CX
+	MOVQ AX, 216(CX)
+	MOVQ $0x00000001, ret+24(FP)
+	RET
+
+	// Return with match too long error
+sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ $0x00000002, ret+24(FP)
+	RET
+
+	// Return with match offset too long error
+error_match_off_too_big:
+	MOVQ ctx+16(FP), AX
+	MOVQ 8(SP), CX
+	MOVQ CX, 224(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000003, ret+24(FP)
+	RET
+
+	// Return with not enough literals error
+error_not_enough_literals:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ $0x00000004, ret+24(FP)
+	RET
+
+	// Return with not enough output space error
+error_not_enough_space:
+	MOVQ ctx+16(FP), AX
+	MOVQ 24(SP), CX
+	MOVQ CX, 208(AX)
+	MOVQ 16(SP), CX
+	MOVQ CX, 216(AX)
+	MOVQ R11, 136(AX)
+	MOVQ $0x00000005, ret+24(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
new file mode 100644
index 0000000..c3452bc
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -0,0 +1,237 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+package zstd
+
+import (
+	"fmt"
+	"io"
+)
+
+// decode sequences from the stream with the provided history but without dictionary.
+func (s *sequenceDecs) decodeSyncSimple(hist []byte) (bool, error) {
+	return false, nil
+}
+
+// decode sequences from the stream without the provided history.
+func (s *sequenceDecs) decode(seqs []seqVals) error {
+	br := s.br
+
+	// Grab full sizes tables, to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+	s.seqSize = 0
+	litRemain := len(s.literals)
+
+	maxBlockSize := maxCompressedBlockSize
+	if s.windowSize < maxBlockSize {
+		maxBlockSize = s.windowSize
+	}
+	for i := range seqs {
+		var ll, mo, ml int
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+			// inlined function:
+			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
+
+			// Final will not read from stream.
+			var llB, mlB, moB uint8
+			ll, llB = llState.final()
+			ml, mlB = mlState.final()
+			mo, moB = ofState.final()
+
+			// extra bits are stored in reverse order.
+			br.fillFast()
+			mo += br.getBits(moB)
+			if s.maxBits > 32 {
+				br.fillFast()
+			}
+			ml += br.getBits(mlB)
+			ll += br.getBits(llB)
+
+			if moB > 1 {
+				s.prevOffset[2] = s.prevOffset[1]
+				s.prevOffset[1] = s.prevOffset[0]
+				s.prevOffset[0] = mo
+			} else {
+				// mo = s.adjustOffset(mo, ll, moB)
+				// Inlined for rather big speedup
+				if ll == 0 {
+					// There is an exception though, when current sequence's literals_length = 0.
+					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+					mo++
+				}
+
+				if mo == 0 {
+					mo = s.prevOffset[0]
+				} else {
+					var temp int
+					if mo == 3 {
+						temp = s.prevOffset[0] - 1
+					} else {
+						temp = s.prevOffset[mo]
+					}
+
+					if temp == 0 {
+						// 0 is not valid; input is corrupted; force offset to 1
+						println("WARNING: temp was 0")
+						temp = 1
+					}
+
+					if mo != 1 {
+						s.prevOffset[2] = s.prevOffset[1]
+					}
+					s.prevOffset[1] = s.prevOffset[0]
+					s.prevOffset[0] = temp
+					mo = temp
+				}
+			}
+			br.fillFast()
+		} else {
+			if br.overread() {
+				if debugDecoder {
+					printf("reading sequence %d, exceeded available data\n", i)
+				}
+				return io.ErrUnexpectedEOF
+			}
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
+			br.fill()
+		}
+
+		if debugSequences {
+			println("Seq", i, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
+		}
+		// Evaluate.
+		// We might be doing this async, so do it early.
+		if mo == 0 && ml > 0 {
+			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
+		}
+		if ml > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
+		}
+		s.seqSize += ll + ml
+		if s.seqSize > maxBlockSize {
+			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		}
+		litRemain -= ll
+		if litRemain < 0 {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, litRemain+ll)
+		}
+		seqs[i] = seqVals{
+			ll: ll,
+			ml: ml,
+			mo: mo,
+		}
+		if i == len(seqs)-1 {
+			// This is the last sequence, so we shouldn't update state.
+			break
+		}
+
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
+		} else {
+			bits := br.get32BitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+	}
+	s.seqSize += litRemain
+	if s.seqSize > maxBlockSize {
+		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+	}
+	err := br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	return err
+}
+
+// executeSimple handles cases when a dictionary is not used.
+func (s *sequenceDecs) executeSimple(seqs []seqVals, hist []byte) error {
+	// Ensure we have enough output size...
+	if len(s.out)+s.seqSize > cap(s.out) {
+		addBytes := s.seqSize + len(s.out)
+		s.out = append(s.out, make([]byte, addBytes)...)
+		s.out = s.out[:len(s.out)-addBytes]
+	}
+
+	if debugDecoder {
+		printf("Execute %d seqs with literals: %d into %d bytes\n", len(seqs), len(s.literals), s.seqSize)
+	}
+
+	var t = len(s.out)
+	out := s.out[:t+s.seqSize]
+
+	for _, seq := range seqs {
+		// Add literals
+		copy(out[t:], s.literals[:seq.ll])
+		t += seq.ll
+		s.literals = s.literals[seq.ll:]
+
+		// Malformed input
+		if seq.mo > t+len(hist) || seq.mo > s.windowSize {
+			return fmt.Errorf("match offset (%d) bigger than current history (%d)", seq.mo, t+len(hist))
+		}
+
+		// Copy from history.
+		if v := seq.mo - t; v > 0 {
+			// v is the start position in history from end.
+			start := len(hist) - v
+			if seq.ml > v {
+				// Some goes into the current block.
+				// Copy remainder of history
+				copy(out[t:], hist[start:])
+				t += v
+				seq.ml -= v
+			} else {
+				copy(out[t:], hist[start:start+seq.ml])
+				t += seq.ml
+				continue
+			}
+		}
+
+		// We must be in the current buffer now
+		if seq.ml > 0 {
+			start := t - seq.mo
+			if seq.ml <= t-start {
+				// No overlap
+				copy(out[t:], out[start:start+seq.ml])
+				t += seq.ml
+			} else {
+				// Overlapping copy
+				// Extend destination slice and copy one byte at the time.
+				src := out[start : start+seq.ml]
+				dst := out[t:]
+				dst = dst[:len(src)]
+				t += len(src)
+				// Destination is the space we just added.
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+	}
+	// Add final literals
+	copy(out[t:], s.literals)
+	if debugDecoder {
+		t += len(s.literals)
+		if t != len(out) {
+			panic(fmt.Errorf("length mismatch, want %d, got %d, ss: %d", len(out), t, s.seqSize))
+		}
+	}
+	s.out = out
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go
index 36bcc3c..8014174 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go
@@ -35,7 +35,6 @@ func (s *seqCoders) setPrev(ll, ml, of *fseEncoder) {
 		// Ensure we cannot reuse by accident
 		prevEnc := *prev
 		prevEnc.symbolLen = 0
-		return
 	}
 	compareSwap(ll, &s.llEnc, &s.llPrev)
 	compareSwap(ml, &s.mlEnc, &s.mlPrev)
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
index 690428c..9e1baad 100644
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -11,7 +11,7 @@ import (
 	"io"
 
 	"github.com/klauspost/compress/huff0"
-	"github.com/klauspost/compress/snappy"
+	snappy "github.com/klauspost/compress/internal/snapref"
 )
 
 const (
@@ -111,7 +111,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 			// Add empty last block
 			r.block.reset(nil)
 			r.block.last = true
-			err := r.block.encodeLits(false)
+			err := r.block.encodeLits(r.block.literals, false)
 			if err != nil {
 				return written, err
 			}
@@ -178,17 +178,16 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 				r.err = ErrSnappyCorrupt
 				return written, r.err
 			}
-			err = r.block.encode(false, false)
+			err = r.block.encode(nil, false, false)
 			switch err {
 			case errIncompressible:
 				r.block.popOffsets()
 				r.block.reset(nil)
 				r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
 				if err != nil {
-					println("snappy.Decode:", err)
 					return written, err
 				}
-				err = r.block.encodeLits(false)
+				err = r.block.encodeLits(r.block.literals, false)
 				if err != nil {
 					return written, err
 				}
@@ -204,7 +203,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 			written += int64(n)
 			continue
 		case chunkTypeUncompressedData:
-			if debug {
+			if debugEncoder {
 				println("Uncompressed, chunklen", chunkLen)
 			}
 			// Section 4.3. Uncompressed data (chunk type 0x01).
@@ -235,7 +234,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 				r.err = ErrSnappyCorrupt
 				return written, r.err
 			}
-			err := r.block.encodeLits(false)
+			err := r.block.encodeLits(r.block.literals, false)
 			if err != nil {
 				return written, err
 			}
@@ -247,7 +246,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 			continue
 
 		case chunkTypeStreamIdentifier:
-			if debug {
+			if debugEncoder {
 				println("stream id", chunkLen, len(snappyMagicBody))
 			}
 			// Section 4.1. Stream identifier (chunk type 0xff).
@@ -417,7 +416,7 @@ var crcTable = crc32.MakeTable(crc32.Castagnoli)
 // https://github.com/google/snappy/blob/master/framing_format.txt
 func snappyCRC(b []byte) uint32 {
 	c := crc32.Update(0, crcTable, b)
-	return uint32(c>>15|c<<17) + 0xa282ead8
+	return c>>15 | c<<17 + 0xa282ead8
 }
 
 // snappyDecodedLen returns the length of the decoded block and the number of bytes
diff --git a/vendor/github.com/klauspost/compress/zstd/zip.go b/vendor/github.com/klauspost/compress/zstd/zip.go
new file mode 100644
index 0000000..b53f606
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/zip.go
@@ -0,0 +1,134 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+import (
+	"errors"
+	"io"
+	"sync"
+)
+
+// ZipMethodWinZip is the method for Zstandard compressed data inside Zip files for WinZip.
+// See https://www.winzip.com/win/en/comp_info.html
+const ZipMethodWinZip = 93
+
+// ZipMethodPKWare is the original method number used by PKWARE to indicate Zstandard compression.
+// Deprecated: This has been deprecated by PKWARE, use ZipMethodWinZip instead for compression.
+// See https://pkware.cachefly.net/webdocs/APPNOTE/APPNOTE-6.3.9.TXT
+const ZipMethodPKWare = 20
+
+var zipReaderPool sync.Pool
+
+// newZipReader creates a pooled zip decompressor.
+func newZipReader(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	pool := &zipReaderPool
+	if len(opts) > 0 {
+		opts = append([]DOption{WithDecoderLowmem(true), WithDecoderMaxWindow(128 << 20)}, opts...)
+		// Force concurrency 1
+		opts = append(opts, WithDecoderConcurrency(1))
+		// Create our own pool
+		pool = &sync.Pool{}
+	}
+	return func(r io.Reader) io.ReadCloser {
+		dec, ok := pool.Get().(*Decoder)
+		if ok {
+			dec.Reset(r)
+		} else {
+			d, err := NewReader(r, opts...)
+			if err != nil {
+				panic(err)
+			}
+			dec = d
+		}
+		return &pooledZipReader{dec: dec, pool: pool}
+	}
+}
+
+type pooledZipReader struct {
+	mu   sync.Mutex // guards Close and Read
+	pool *sync.Pool
+	dec  *Decoder
+}
+
+func (r *pooledZipReader) Read(p []byte) (n int, err error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.dec == nil {
+		return 0, errors.New("read after close or EOF")
+	}
+	dec, err := r.dec.Read(p)
+	if err == io.EOF {
+		r.dec.Reset(nil)
+		r.pool.Put(r.dec)
+		r.dec = nil
+	}
+	return dec, err
+}
+
+func (r *pooledZipReader) Close() error {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	var err error
+	if r.dec != nil {
+		err = r.dec.Reset(nil)
+		r.pool.Put(r.dec)
+		r.dec = nil
+	}
+	return err
+}
+
+type pooledZipWriter struct {
+	mu   sync.Mutex // guards Close and Read
+	enc  *Encoder
+	pool *sync.Pool
+}
+
+func (w *pooledZipWriter) Write(p []byte) (n int, err error) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	if w.enc == nil {
+		return 0, errors.New("Write after Close")
+	}
+	return w.enc.Write(p)
+}
+
+func (w *pooledZipWriter) Close() error {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	var err error
+	if w.enc != nil {
+		err = w.enc.Close()
+		w.pool.Put(w.enc)
+		w.enc = nil
+	}
+	return err
+}
+
+// ZipCompressor returns a compressor that can be registered with zip libraries.
+// The provided encoder options will be used on all encodes.
+func ZipCompressor(opts ...EOption) func(w io.Writer) (io.WriteCloser, error) {
+	var pool sync.Pool
+	return func(w io.Writer) (io.WriteCloser, error) {
+		enc, ok := pool.Get().(*Encoder)
+		if ok {
+			enc.Reset(w)
+		} else {
+			var err error
+			enc, err = NewWriter(w, opts...)
+			if err != nil {
+				return nil, err
+			}
+		}
+		return &pooledZipWriter{enc: enc, pool: &pool}, nil
+	}
+}
+
+// ZipDecompressor returns a decompressor that can be registered with zip libraries.
+// See ZipCompressor for example.
+// Options can be specified. WithDecoderConcurrency(1) is forced,
+// and by default a 128MB maximum decompression window is specified.
+// The window size can be overridden if required.
+func ZipDecompressor(opts ...DOption) func(r io.Reader) io.ReadCloser {
+	return newZipReader(opts...)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 0807719..c1c90b4 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -4,6 +4,8 @@
 package zstd
 
 import (
+	"bytes"
+	"encoding/binary"
 	"errors"
 	"log"
 	"math"
@@ -13,6 +15,12 @@ import (
 // enable debug printing
 const debug = false
 
+// enable encoding debug printing
+const debugEncoder = debug
+
+// enable decoding debug printing
+const debugDecoder = debug
+
 // Enable extra assertions.
 const debugAsserts = debug || false
 
@@ -31,6 +39,9 @@ const zstdMinMatch = 3
 // Reset the buffer offset when reaching this.
 const bufferReset = math.MaxInt32 - MaxWindowSize
 
+// fcsUnknown is used for unknown frame content size.
+const fcsUnknown = math.MaxUint64
+
 var (
 	// ErrReservedBlockType is returned when a reserved block type is found.
 	// Typically this indicates wrong or corrupted input.
@@ -44,6 +55,10 @@ var (
 	// Typically returned on invalid input.
 	ErrBlockTooSmall = errors.New("block too small")
 
+	// ErrUnexpectedBlockSize is returned when a block has unexpected size.
+	// Typically returned on invalid input.
+	ErrUnexpectedBlockSize = errors.New("unexpected block size")
+
 	// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
 	// Typically this indicates wrong or corrupted input.
 	ErrMagicMismatch = errors.New("invalid input: magic number mismatch")
@@ -67,22 +82,30 @@ var (
 	// This is only returned if SingleSegment is specified on the frame.
 	ErrFrameSizeExceeded = errors.New("frame size exceeded")
 
+	// ErrFrameSizeMismatch is returned if the stated frame size does not match the expected size.
+	// This is only returned if SingleSegment is specified on the frame.
+	ErrFrameSizeMismatch = errors.New("frame size does not match size on stream")
+
 	// ErrCRCMismatch is returned if CRC mismatches.
 	ErrCRCMismatch = errors.New("CRC check failed")
 
 	// ErrDecoderClosed will be returned if the Decoder was used after
 	// Close has been called.
 	ErrDecoderClosed = errors.New("decoder used after Close")
+
+	// ErrDecoderNilInput is returned when a nil Reader was provided
+	// and an operation other than Reset/DecodeAll/Close was attempted.
+	ErrDecoderNilInput = errors.New("nil input provided as reader")
 )
 
 func println(a ...interface{}) {
-	if debug {
+	if debug || debugDecoder || debugEncoder {
 		log.Println(a...)
 	}
 }
 
 func printf(format string, a ...interface{}) {
-	if debug {
+	if debug || debugDecoder || debugEncoder {
 		log.Printf(format, a...)
 	}
 }
@@ -121,24 +144,20 @@ func matchLen(a, b []byte) int {
 }
 
 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }
 
 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
+
+type byter interface {
+	Bytes() []byte
+	Len() int
+}
+
+var _ byter = &bytes.Buffer{}
diff --git a/vendor/github.com/klauspost/pgzip/.travis.yml b/vendor/github.com/klauspost/pgzip/.travis.yml
index 6e9fca0..acfec4b 100644
--- a/vendor/github.com/klauspost/pgzip/.travis.yml
+++ b/vendor/github.com/klauspost/pgzip/.travis.yml
@@ -1,19 +1,22 @@
 language: go
 
-sudo: false
-
 os:
   - linux
   - osx
 
 go:
-  - 1.9.x
-  - 1.10.x
+  - 1.13.x
+  - 1.14.x
+  - 1.15.x
   - master
 
-script: 
- - go test -v -cpu=1,2,4 .
- - go test -v -cpu=2 -race -short .
+env:
+  - GO111MODULE=off
+
+script:
+  - diff <(gofmt -d .) <(printf "")
+  - go test -v -cpu=1,2,4 .
+  - go test -v -cpu=2 -race -short .
 
 matrix:
   allow_failures:
diff --git a/vendor/github.com/klauspost/pgzip/gunzip.go b/vendor/github.com/klauspost/pgzip/gunzip.go
index 93efec7..d1ae730 100644
--- a/vendor/github.com/klauspost/pgzip/gunzip.go
+++ b/vendor/github.com/klauspost/pgzip/gunzip.go
@@ -331,6 +331,16 @@ func (z *Reader) killReadAhead() error {
 		// Wait for decompressor to be closed and return error, if any.
 		e, ok := <-z.closeErr
 		z.activeRA = false
+
+		for blk := range z.readAhead {
+			if blk.b != nil {
+				z.blockPool <- blk.b
+			}
+		}
+		if cap(z.current) > 0 {
+			z.blockPool <- z.current
+			z.current = nil
+		}
 		if !ok {
 			// Channel is closed, so if there was any error it has already been returned.
 			return nil
@@ -418,6 +428,7 @@ func (z *Reader) doReadAhead() {
 			case z.readAhead <- read{b: buf, err: err}:
 			case <-closeReader:
 				// Sent on close, we don't care about the next results
+				z.blockPool <- buf
 				return
 			}
 			if err != nil {
diff --git a/vendor/github.com/magiconair/properties/.travis.yml b/vendor/github.com/magiconair/properties/.travis.yml
index 6cee379..baf9031 100644
--- a/vendor/github.com/magiconair/properties/.travis.yml
+++ b/vendor/github.com/magiconair/properties/.travis.yml
@@ -12,4 +12,6 @@ go:
     - "1.12.x"
     - "1.13.x"
     - "1.14.x"
+    - "1.15.x"
+    - "1.16.x"
     - tip
diff --git a/vendor/github.com/magiconair/properties/CHANGELOG.md b/vendor/github.com/magiconair/properties/CHANGELOG.md
index 176626a..ff8d025 100644
--- a/vendor/github.com/magiconair/properties/CHANGELOG.md
+++ b/vendor/github.com/magiconair/properties/CHANGELOG.md
@@ -1,8 +1,29 @@
 ## Changelog
 
+### [1.8.2](https://github.com/magiconair/properties/tree/v1.8.2) - 25 Aug 2020
+
+ * [PR #36](https://github.com/magiconair/properties/pull/36): Escape backslash on write
+
+   This patch ensures that backslashes are escaped on write. Existing applications which
+   rely on the old behavior may need to be updated.
+
+   Thanks to [@apesternikov](https://github.com/apesternikov) for the patch.
+
+ * [PR #42](https://github.com/magiconair/properties/pull/42): Made Content-Type check whitespace agnostic in LoadURL()
+
+   Thanks to [@aliras1](https://github.com/aliras1) for the patch.
+
+ * [PR #41](https://github.com/magiconair/properties/pull/41): Make key/value separator configurable on Write()
+
+   Thanks to [@mkjor](https://github.com/mkjor) for the patch.
+
+ * [PR #40](https://github.com/magiconair/properties/pull/40): Add method to return a sorted list of keys
+
+   Thanks to [@mkjor](https://github.com/mkjor) for the patch.
+
 ### [1.8.1](https://github.com/magiconair/properties/tree/v1.8.1) - 10 May 2019
 
- * [PR #26](https://github.com/magiconair/properties/pull/35): Close body always after request
+ * [PR #35](https://github.com/magiconair/properties/pull/35): Close body always after request
 
    This patch ensures that in `LoadURL` the response body is always closed.
 
diff --git a/vendor/github.com/magiconair/properties/README.md b/vendor/github.com/magiconair/properties/README.md
index 42ed5c3..e2edda0 100644
--- a/vendor/github.com/magiconair/properties/README.md
+++ b/vendor/github.com/magiconair/properties/README.md
@@ -1,6 +1,5 @@
 [![](https://img.shields.io/github/tag/magiconair/properties.svg?style=flat-square&label=release)](https://github.com/magiconair/properties/releases)
 [![Travis CI Status](https://img.shields.io/travis/magiconair/properties.svg?branch=master&style=flat-square&label=travis)](https://travis-ci.org/magiconair/properties)
-[![CircleCI Status](https://img.shields.io/circleci/project/github/magiconair/properties.svg?label=circle+ci&style=flat-square)](https://circleci.com/gh/magiconair/properties)
 [![License](https://img.shields.io/badge/License-BSD%202--Clause-orange.svg?style=flat-square)](https://raw.githubusercontent.com/magiconair/properties/master/LICENSE)
 [![GoDoc](http://img.shields.io/badge/godoc-reference-5272B4.svg?style=flat-square)](http://godoc.org/github.com/magiconair/properties)
 
diff --git a/vendor/github.com/magiconair/properties/lex.go b/vendor/github.com/magiconair/properties/lex.go
index 367166d..e1e9dd7 100644
--- a/vendor/github.com/magiconair/properties/lex.go
+++ b/vendor/github.com/magiconair/properties/lex.go
@@ -128,18 +128,6 @@ func (l *lexer) acceptRun(valid string) {
 	l.backup()
 }
 
-// acceptRunUntil consumes a run of runes up to a terminator.
-func (l *lexer) acceptRunUntil(term rune) {
-	for term != l.next() {
-	}
-	l.backup()
-}
-
-// hasText returns true if the current parsed text is not empty.
-func (l *lexer) isNotEmpty() bool {
-	return l.pos > l.start
-}
-
 // lineNumber reports which line we're on, based on the position of
 // the previous item returned by nextItem. Doing it this way
 // means we don't have to worry about peek double counting.
diff --git a/vendor/github.com/magiconair/properties/load.go b/vendor/github.com/magiconair/properties/load.go
index ab95325..c83c2da 100644
--- a/vendor/github.com/magiconair/properties/load.go
+++ b/vendor/github.com/magiconair/properties/load.go
@@ -132,11 +132,12 @@ func (l *Loader) LoadURL(url string) (*Properties, error) {
 	}
 
 	ct := resp.Header.Get("Content-Type")
+	ct = strings.Join(strings.Fields(ct), "")
 	var enc Encoding
 	switch strings.ToLower(ct) {
-	case "text/plain", "text/plain; charset=iso-8859-1", "text/plain; charset=latin1":
+	case "text/plain", "text/plain;charset=iso-8859-1", "text/plain;charset=latin1":
 		enc = ISO_8859_1
-	case "", "text/plain; charset=utf-8":
+	case "", "text/plain;charset=utf-8":
 		enc = UTF8
 	default:
 		return nil, fmt.Errorf("properties: invalid content type %s", ct)
diff --git a/vendor/github.com/magiconair/properties/parser.go b/vendor/github.com/magiconair/properties/parser.go
index cdc4a80..430e4fc 100644
--- a/vendor/github.com/magiconair/properties/parser.go
+++ b/vendor/github.com/magiconair/properties/parser.go
@@ -59,14 +59,6 @@ func (p *parser) errorf(format string, args ...interface{}) {
 	panic(fmt.Errorf(format, args...))
 }
 
-func (p *parser) expect(expected itemType) (token item) {
-	token = p.lex.nextItem()
-	if token.typ != expected {
-		p.unexpected(token)
-	}
-	return token
-}
-
 func (p *parser) expectOneOf(expected ...itemType) (token item) {
 	token = p.lex.nextItem()
 	for _, v := range expected {
@@ -91,5 +83,4 @@ func (p *parser) recover(errp *error) {
 		}
 		*errp = e.(error)
 	}
-	return
 }
diff --git a/vendor/github.com/magiconair/properties/properties.go b/vendor/github.com/magiconair/properties/properties.go
index 2e4c8a2..62ae2d6 100644
--- a/vendor/github.com/magiconair/properties/properties.go
+++ b/vendor/github.com/magiconair/properties/properties.go
@@ -8,6 +8,7 @@ package properties
 // BUG(frank): Write() does not allow to configure the newline character. Therefore, on Windows LF is used.
 
 import (
+	"bytes"
 	"fmt"
 	"io"
 	"log"
@@ -115,7 +116,7 @@ func (p *Properties) Get(key string) (value string, ok bool) {
 	// circular references and malformed expressions
 	// so we panic if we still get an error here.
 	if err != nil {
-		ErrorHandler(fmt.Errorf("%s in %q", err, key+" = "+v))
+		ErrorHandler(err)
 	}
 
 	return expanded, true
@@ -636,7 +637,7 @@ func (p *Properties) WriteComment(w io.Writer, prefix string, enc Encoding) (n i
 					}
 
 					for _, c := range comments {
-						x, err = fmt.Fprintf(w, "%s%s\n", prefix, encode(c, "", enc))
+						x, err = fmt.Fprintf(w, "%s%s\n", prefix, c)
 						if err != nil {
 							return
 						}
@@ -766,7 +767,12 @@ func expand(s string, keys []string, prefix, postfix string, values map[string]s
 
 		for _, k := range keys {
 			if key == k {
-				return "", fmt.Errorf("circular reference")
+				var b bytes.Buffer
+				b.WriteString("circular reference in:\n")
+				for _, k1 := range keys {
+					fmt.Fprintf(&b, "%s=%s\n", k1, values[k1])
+				}
+				return "", fmt.Errorf(b.String())
 			}
 		}
 
@@ -780,7 +786,6 @@ func expand(s string, keys []string, prefix, postfix string, values map[string]s
 		}
 		s = s[:start] + new_val + s[end+1:]
 	}
-	return s, nil
 }
 
 // encode encodes a UTF-8 string to ISO-8859-1 and escapes some characters.
@@ -833,6 +838,8 @@ func escape(r rune, special string) string {
 		return "\\r"
 	case '\t':
 		return "\\t"
+	case '\\':
+		return "\\\\"
 	default:
 		if strings.ContainsRune(special, r) {
 			return "\\" + string(r)
diff --git a/vendor/github.com/nwaples/rardecode/archive.go b/vendor/github.com/nwaples/rardecode/archive.go
index f878751..34e4ac2 100644
--- a/vendor/github.com/nwaples/rardecode/archive.go
+++ b/vendor/github.com/nwaples/rardecode/archive.go
@@ -146,67 +146,54 @@ type volume struct {
 	old   bool          // uses old naming scheme
 }
 
-// nextVolName updates name to the next filename in the archive.
-func (v *volume) nextVolName() {
-	if v.num == 0 {
-		// check file extensions
-		i := strings.LastIndex(v.file, ".")
-		if i < 0 {
-			// no file extension, add one
-			i = len(v.file)
-			v.file += ".rar"
-		} else {
-			ext := strings.ToLower(v.file[i+1:])
-			// replace with .rar for empty extensions & self extracting archives
-			if ext == "" || ext == "exe" || ext == "sfx" {
-				v.file = v.file[:i+1] + "rar"
-			}
-		}
-		if a, ok := v.fileBlockReader.(*archive15); ok {
-			v.old = a.old
-		}
-		// new naming scheme must have volume number in filename
-		if !v.old && reDigits.FindStringIndex(v.file) == nil {
-			v.old = true
-		}
-		// For old style naming if 2nd and 3rd character of file extension is not a digit replace
-		// with "00" and ignore any trailing characters.
-		if v.old && (len(v.file) < i+4 || v.file[i+2] < '0' || v.file[i+2] > '9' || v.file[i+3] < '0' || v.file[i+3] > '9') {
-			v.file = v.file[:i+2] + "00"
-			return
-		}
+func (v *volume) openFile(file string) error {
+	f, err := os.Open(v.dir + file)
+	if err != nil {
+		return err
 	}
-	// new style volume naming
-	if !v.old {
-		// find all numbers in volume name
-		m := reDigits.FindAllStringIndex(v.file, -1)
-		if l := len(m); l > 1 {
-			// More than 1 match so assume name.part###of###.rar style.
-			// Take the last 2 matches where the first is the volume number.
-			m = m[l-2 : l]
-			if strings.Contains(v.file[m[0][1]:m[1][0]], ".") || !strings.Contains(v.file[:m[0][0]], ".") {
-				// Didn't match above style as volume had '.' between the two numbers or didnt have a '.'
-				// before the first match. Use the second number as volume number.
-				m = m[1:]
-			}
-		}
-		// extract and increment volume number
-		lo, hi := m[0][0], m[0][1]
-		n, err := strconv.Atoi(v.file[lo:hi])
-		if err != nil {
-			n = 0
-		} else {
-			n++
+	v.f = f
+	v.file = file
+	return nil
+}
+
+func nextNewVolName(file string) string {
+	// find all numbers in volume name
+	m := reDigits.FindAllStringIndex(file, -1)
+	if l := len(m); l > 1 {
+		// More than 1 match so assume name.part###of###.rar style.
+		// Take the last 2 matches where the first is the volume number.
+		m = m[l-2 : l]
+		if strings.Contains(file[m[0][1]:m[1][0]], ".") || !strings.Contains(file[:m[0][0]], ".") {
+			// Didn't match above style as volume had '.' between the two numbers or didnt have a '.'
+			// before the first match. Use the second number as volume number.
+			m = m[1:]
 		}
-		// volume number must use at least the same number of characters as previous volume
-		vol := fmt.Sprintf("%0"+fmt.Sprint(hi-lo)+"d", n)
-		v.file = v.file[:lo] + vol + v.file[hi:]
-		return
 	}
+	// extract and increment volume number
+	lo, hi := m[0][0], m[0][1]
+	n, err := strconv.Atoi(file[lo:hi])
+	if err != nil {
+		n = 0
+	} else {
+		n++
+	}
+	// volume number must use at least the same number of characters as previous volume
+	vol := fmt.Sprintf("%0"+fmt.Sprint(hi-lo)+"d", n)
+	file = file[:lo] + vol + file[hi:]
+	return file
+}
+
+func nextOldVolName(file string) string {
 	// old style volume naming
-	i := strings.LastIndex(v.file, ".")
+	i := strings.LastIndex(file, ".")
+	// For old style naming if 2nd and 3rd character of file extension is not a digit replace
+	// with "00" and ignore any trailing characters.
+	if len(file) < i+4 || file[i+2] < '0' || file[i+2] > '9' || file[i+3] < '0' || file[i+3] > '9' {
+		file = file[:i+2] + "00"
+		return file
+	}
 	// get file extension
-	b := []byte(v.file[i+1:])
+	b := []byte(file[i+1:])
 	// start incrementing volume number digits from rightmost
 	for j := 2; j >= 0; j-- {
 		if b[j] != '9' {
@@ -222,7 +209,54 @@ func (v *volume) nextVolName() {
 			b[j] = '0'
 		}
 	}
-	v.file = v.file[:i+1] + string(b)
+	file = file[:i+1] + string(b)
+	return file
+}
+
+// openNextFile opens the next volume file in the archive.
+func (v *volume) openNextFile() error {
+	file := v.file
+	if v.num == 0 {
+		// check file extensions
+		i := strings.LastIndex(file, ".")
+		if i < 0 {
+			// no file extension, add one
+			file += ".rar"
+		} else {
+			ext := strings.ToLower(file[i+1:])
+			// replace with .rar for empty extensions & self extracting archives
+			if ext == "" || ext == "exe" || ext == "sfx" {
+				file = file[:i+1] + "rar"
+			}
+		}
+		if a, ok := v.fileBlockReader.(*archive15); ok {
+			v.old = a.old
+		}
+		// new naming scheme must have volume number in filename
+		if !v.old {
+			if reDigits.FindStringIndex(file) != nil {
+				// found digits, try using new naming scheme
+				err := v.openFile(nextNewVolName(file))
+				if err != nil && os.IsNotExist(err) {
+					// file didn't exist, try old naming scheme
+					oldErr := v.openFile(nextOldVolName(file))
+					if oldErr == nil || !os.IsNotExist(err) {
+						v.old = true
+						return oldErr
+					}
+				}
+				return err
+			}
+			v.old = true
+		}
+	}
+	// new style volume naming
+	if !v.old {
+		file = nextNewVolName(file)
+	} else {
+		file = nextOldVolName(file)
+	}
+	return v.openFile(file)
 }
 
 func (v *volume) next() (*fileBlockHeader, error) {
@@ -241,8 +275,7 @@ func (v *volume) next() (*fileBlockHeader, error) {
 		}
 
 		v.f.Close()
-		v.nextVolName()
-		v.f, err = os.Open(v.dir + v.file) // Open next volume file
+		err = v.openNextFile() // Open next volume file
 		if err != nil {
 			if atEOF && os.IsNotExist(err) {
 				// volume not found so assume that the archive has ended
diff --git a/vendor/github.com/nwaples/rardecode/decode_reader.go b/vendor/github.com/nwaples/rardecode/decode_reader.go
index b346936..36699f9 100644
--- a/vendor/github.com/nwaples/rardecode/decode_reader.go
+++ b/vendor/github.com/nwaples/rardecode/decode_reader.go
@@ -162,9 +162,6 @@ func (d *decodeReader) queueFilter(f *filterBlock) error {
 	if len(d.filters) >= maxQueuedFilters {
 		return errTooManyFilters
 	}
-	// offset & length must be < window size
-	f.offset &= d.win.mask
-	f.length &= d.win.mask
 	// make offset relative to previous filter in list
 	for _, fb := range d.filters {
 		if f.offset < fb.offset {
@@ -173,6 +170,9 @@ func (d *decodeReader) queueFilter(f *filterBlock) error {
 		}
 		f.offset -= fb.offset
 	}
+	// offset & length must be < window size
+	f.offset &= d.win.mask
+	f.length &= d.win.mask
 	d.filters = append(d.filters, f)
 	return nil
 }
diff --git a/vendor/github.com/nwaples/rardecode/ppm_model.go b/vendor/github.com/nwaples/rardecode/ppm_model.go
index 58a545a..fd55a74 100644
--- a/vendor/github.com/nwaples/rardecode/ppm_model.go
+++ b/vendor/github.com/nwaples/rardecode/ppm_model.go
@@ -193,112 +193,26 @@ func (s *state) setUint16(n uint16) { s.sym = byte(n); s.freq = byte(n >> 8) }
 // If there is only one state, the second state contains that state.
 // If there is more than one state, the second state contains the summFreq
 // and the index to the slice of states.
-type context struct {
-	i int32   // index into the states array for context
-	s []state // slice of two states representing context
-	a *subAllocator
-}
-
-// succPtr returns a pointer value for the context to be stored in a state.succ
-func (c *context) succPtr() int32 { return c.i }
-
-func (c *context) numStates() int { return int(c.s[0].uint16()) }
-
-func (c *context) setNumStates(n int) { c.s[0].setUint16(uint16(n)) }
-
-func (c *context) statesIndex() int32 { return c.s[1].succ }
-
-func (c *context) setStatesIndex(n int32) { c.s[1].succ = n }
-
-func (c *context) suffix() *context { return c.a.succContext(c.s[0].succ) }
-
-func (c *context) setSuffix(sc *context) { c.s[0].succ = sc.i }
-
-func (c *context) summFreq() uint16 { return c.s[1].uint16() }
-
-func (c *context) setSummFreq(f uint16) { c.s[1].setUint16(f) }
-
-func (c *context) notEq(ctx *context) bool { return c.i != ctx.i }
-
-func (c *context) states() []state {
-	if ns := int32(c.s[0].uint16()); ns != 1 {
-		i := c.s[1].succ
-		return c.a.states[i : i+ns]
-	}
-	return c.s[1:]
-}
-
-// shrinkStates shrinks the state list down to size states
-func (c *context) shrinkStates(states []state, size int) []state {
-	i1 := units2Index[(len(states)+1)>>1]
-	i2 := units2Index[(size+1)>>1]
+// The context is represented by the index into the states array for these two states.
+type context int32
 
-	if size == 1 {
-		// store state in context, and free states block
-		n := c.statesIndex()
-		c.s[1] = states[0]
-		states = c.s[1:]
-		c.a.addFreeBlock(n, i1)
-	} else if i1 != i2 {
-		if n := c.a.removeFreeBlock(i2); n > 0 {
-			// allocate new block and copy
-			copy(c.a.states[n:], states[:size])
-			states = c.a.states[n:]
-			// free old block
-			c.a.addFreeBlock(c.statesIndex(), i1)
-			c.setStatesIndex(n)
-		} else {
-			// split current block, and free units not needed
-			n = c.statesIndex() + index2Units[i2]<<1
-			u := index2Units[i1] - index2Units[i2]
-			c.a.freeUnits(n, u)
-		}
-	}
-	c.setNumStates(size)
-	return states[:size]
-}
-
-// expandStates expands the states list by one
-func (c *context) expandStates() []state {
-	states := c.states()
-	ns := len(states)
-	if ns == 1 {
-		s := states[0]
-		n := c.a.allocUnits(1)
-		if n == 0 {
-			return nil
-		}
-		c.setStatesIndex(n)
-		states = c.a.states[n:]
-		states[0] = s
-	} else if ns&0x1 == 0 {
-		u := ns >> 1
-		i1 := units2Index[u]
-		i2 := units2Index[u+1]
-		if i1 != i2 {
-			n := c.a.allocUnits(i2)
-			if n == 0 {
-				return nil
-			}
-			copy(c.a.states[n:], states)
-			c.a.addFreeBlock(c.statesIndex(), i1)
-			c.setStatesIndex(n)
-			states = c.a.states[n:]
-		}
+// succContext returns a context given a state.succ index
+func succContext(i int32) context {
+	if i <= 0 {
+		return 0
 	}
-	c.setNumStates(ns + 1)
-	return states[:ns+1]
+	return context(i)
 }
 
 type subAllocator struct {
 	// memory for allocation is split into two heaps
 
+	glueCount     int
 	heap1MaxBytes int32 // maximum bytes available in heap1
 	heap1Lo       int32 // heap1 bottom in number of bytes
 	heap1Hi       int32 // heap1 top in number of bytes
 	heap2Lo       int32 // heap2 bottom index in states
 	heap2Hi       int32 // heap2 top index in states
-	glueCount     int
 
 	// Each freeList entry contains an index into states for the beginning
 	// of a free block. The first state in that block may contain an index
@@ -340,9 +254,6 @@ func (a *subAllocator) restart() {
 	for i := range a.freeList {
 		a.freeList[i] = 0
 	}
-	for i := range a.states {
-		a.states[i] = state{}
-	}
 }
 
 // pushByte puts a byte on the heap and returns a state.succ index that
@@ -389,17 +300,6 @@ func (a *subAllocator) succByte(i int32) byte {
 	}
 }
 
-// succContext returns a context given a state.succ index
-func (a *subAllocator) succContext(i int32) *context {
-	if i <= 0 {
-		return nil
-	}
-	return &context{i: i, s: a.states[i : i+2 : i+2], a: a}
-}
-
-// succIsNil returns whether a state.succ points to nothing
-func (a *subAllocator) succIsNil(i int32) bool { return i == 0 }
-
 // nextByteAddr takes a state.succ value representing a pointer
 // to a byte, and returns the next bytes address
 func (a *subAllocator) nextByteAddr(n int32) int32 { return n - 1 }
@@ -521,7 +421,7 @@ func (a *subAllocator) allocUnits(i byte) int32 {
 	return a.allocUnitsRare(i)
 }
 
-func (a *subAllocator) newContext(s state, suffix *context) *context {
+func (a *subAllocator) newContext(s state, suffix context) context {
 	var n int32
 	if a.heap2Lo < a.heap2Hi {
 		// allocate from top of heap2
@@ -529,28 +429,121 @@ func (a *subAllocator) newContext(s state, suffix *context) *context {
 		n = a.heap2Hi
 	} else if n = a.removeFreeBlock(1); n == 0 {
 		if n = a.allocUnitsRare(1); n == 0 {
-			return nil
+			return 0
 		}
 	}
-	c := &context{i: n, s: a.states[n : n+2 : n+2], a: a}
-	c.s[0] = state{}
-	c.setNumStates(1)
-	c.s[1] = s
-	if suffix != nil {
-		c.setSuffix(suffix)
-	}
-	return c
+	// we don't need to set numStates to 1 as the default value of 0 in the sym
+	// field is always incremented by 1 to get numStates.
+	a.states[n] = state{succ: int32(suffix)}
+	a.states[n+1] = s
+	return context(n)
 }
 
-func (a *subAllocator) newContextSize(ns int) *context {
-	c := a.newContext(state{}, nil)
-	c.setNumStates(ns)
+func (a *subAllocator) newContextSize(ns int) context {
+	c := a.newContext(state{}, context(0))
+	a.contextSetNumStates(c, ns)
 	i := units2Index[(ns+1)>>1]
 	n := a.allocUnits(i)
-	c.setStatesIndex(n)
+	a.contextSetStatesIndex(c, n)
 	return c
 }
 
+// since number of states is always > 0 && <= 256, we can fit it in a single byte
+func (a *subAllocator) contextNumStates(c context) int       { return int(a.states[c].sym) + 1 }
+func (a *subAllocator) contextSetNumStates(c context, n int) { a.states[c].sym = byte(n - 1) }
+
+func (a *subAllocator) contextSummFreq(c context) uint16       { return a.states[c+1].uint16() }
+func (a *subAllocator) contextSetSummFreq(c context, n uint16) { a.states[c+1].setUint16(n) }
+func (a *subAllocator) contextIncSummFreq(c context, n uint16) {
+	a.states[c+1].setUint16(a.states[c+1].uint16() + n)
+}
+
+func (a *subAllocator) contextSuffix(c context) context { return succContext(a.states[c].succ) }
+
+func (a *subAllocator) contextStatesIndex(c context) int32       { return a.states[c+1].succ }
+func (a *subAllocator) contextSetStatesIndex(c context, n int32) { a.states[c+1].succ = n }
+
+func (a *subAllocator) contextStates(c context) []state {
+	if ns := int32(a.states[c].sym) + 1; ns != 1 {
+		i := a.states[c+1].succ
+		return a.states[i : i+ns]
+	}
+	return a.states[c+1 : c+2]
+}
+
+// shrinkStates shrinks the state list down to size states
+func (a *subAllocator) shrinkStates(c context, states []state, size int) []state {
+	i1 := units2Index[(len(states)+1)>>1]
+	i2 := units2Index[(size+1)>>1]
+
+	if size == 1 {
+		// store state in context, and free states block
+		n := a.contextStatesIndex(c)
+		a.states[c+1] = states[0]
+		states = a.states[c+1:]
+		a.addFreeBlock(n, i1)
+	} else if i1 != i2 {
+		if n := a.removeFreeBlock(i2); n > 0 {
+			// allocate new block and copy
+			copy(a.states[n:], states[:size])
+			states = a.states[n:]
+			// free old block
+			a.addFreeBlock(a.contextStatesIndex(c), i1)
+			a.contextSetStatesIndex(c, n)
+		} else {
+			// split current block, and free units not needed
+			n = a.contextStatesIndex(c) + index2Units[i2]<<1
+			u := index2Units[i1] - index2Units[i2]
+			a.freeUnits(n, u)
+		}
+	}
+	a.contextSetNumStates(c, size)
+	return states[:size]
+}
+
+// expandStates expands the states list by one
+func (a *subAllocator) expandStates(c context) []state {
+	states := a.contextStates(c)
+	ns := len(states)
+	if ns == 1 {
+		s := states[0]
+		n := a.allocUnits(1)
+		if n == 0 {
+			return nil
+		}
+		a.contextSetStatesIndex(c, n)
+		states = a.states[n:]
+		states[0] = s
+	} else if ns&0x1 == 0 {
+		u := ns >> 1
+		i1 := units2Index[u]
+		i2 := units2Index[u+1]
+		if i1 != i2 {
+			n := a.allocUnits(i2)
+			if n == 0 {
+				return nil
+			}
+			copy(a.states[n:], states)
+			a.addFreeBlock(a.contextStatesIndex(c), i1)
+			a.contextSetStatesIndex(c, n)
+			states = a.states[n:]
+		}
+	}
+	a.contextSetNumStates(c, ns+1)
+	return states[:ns+1]
+}
+
+func (a *subAllocator) findState(c context, sym byte) *state {
+	var i int
+	states := a.contextStates(c)
+	for i = range states {
+		if states[i].sym == sym {
+			break
+		}
+	}
+	return &states[i]
+}
+
 type model struct {
 	maxOrder    int
 	orderFall   int
@@ -560,13 +553,14 @@ type model struct {
 	escCount    byte
 	prevSym     byte
 	initEsc     byte
-	minC        *context
-	maxC        *context
+	c           context
 	rc          rangeCoder
 	a           subAllocator
 	charMask    [256]byte
 	binSumm     [128][64]uint16
 	see2Cont    [25][16]see2Context
+	ibuf        [256]int
+	sbuf        [256]*state
 }
 
 func (m *model) restart() {
@@ -586,15 +580,12 @@ func (m *model) restart() {
 
 	m.a.restart()
 
-	c := m.a.newContextSize(256)
-	c.setSummFreq(257)
-	states := c.states()
+	m.c = m.a.newContextSize(256)
+	m.a.contextSetSummFreq(m.c, 257)
+	states := m.a.contextStates(m.c)
 	for i := range states {
 		states[i] = state{sym: byte(i), freq: 1}
 	}
-	m.minC = c
-	m.maxC = c
-	m.prevSym = 0
 
 	for i := range m.binSumm {
 		for j, esc := range initBinEsc {
@@ -619,9 +610,6 @@ func (m *model) init(br io.ByteReader, reset bool, maxOrder, maxMB int) error {
 		return err
 	}
 	if !reset {
-		if m.minC == nil {
-			return errCorruptPPM
-		}
 		return nil
 	}
 
@@ -631,21 +619,21 @@ func (m *model) init(br io.ByteReader, reset bool, maxOrder, maxMB int) error {
 		return errCorruptPPM
 	}
 	m.maxOrder = maxOrder
-	m.restart()
+	m.prevSym = 0
+	m.c = 0
 	return nil
 }
 
-func (m *model) rescale(s *state) *state {
+func (m *model) rescale(c context, s *state) *state {
 	if s.freq <= maxFreq {
 		return s
 	}
-	c := m.minC
 
 	var summFreq uint16
 
 	s.freq += 4
-	states := c.states()
-	escFreq := c.summFreq() + 4
+	states := m.a.contextStates(c)
+	escFreq := m.a.contextSummFreq(c) + 4
 
 	for i := range states {
 		f := states[i].freq
@@ -675,7 +663,7 @@ func (m *model) rescale(s *state) *state {
 		escFreq++
 	}
 	if i != len(states)-1 {
-		states = c.shrinkStates(states, i+1)
+		states = m.a.shrinkStates(c, states, i+1)
 	}
 	s = &states[0]
 	if i == 0 {
@@ -688,15 +676,14 @@ func (m *model) rescale(s *state) *state {
 		}
 	}
 	summFreq += escFreq - (escFreq >> 1)
-	c.setSummFreq(summFreq)
+	m.a.contextSetSummFreq(c, summFreq)
 	return s
 }
 
-func (m *model) decodeBinSymbol() (*state, error) {
-	c := m.minC
-	s := &c.states()[0]
+func (m *model) decodeBinSymbol(c context) (*state, error) {
+	s := &m.a.contextStates(c)[0]
 
-	ns := c.suffix().numStates()
+	ns := m.a.contextNumStates(m.a.contextSuffix(c))
 	i := m.prevSuccess + ns2BSIndex[ns-1] + byte(m.runLength>>26)&0x20
 	if m.prevSym >= 64 {
 		i += 8
@@ -725,10 +712,9 @@ func (m *model) decodeBinSymbol() (*state, error) {
 	return nil, err
 }
 
-func (m *model) decodeSymbol1() (*state, error) {
-	c := m.minC
-	states := c.states()
-	scale := uint32(c.summFreq())
+func (m *model) decodeSymbol1(c context) (*state, error) {
+	states := m.a.contextStates(c)
+	scale := uint32(m.a.contextSummFreq(c))
 	// protect against divide by zero
 	// TODO: look at why this happens, may be problem elsewhere
 	if scale == 0 {
@@ -746,7 +732,7 @@ func (m *model) decodeSymbol1() (*state, error) {
 		}
 		err := m.rc.decode(n-uint32(s.freq), n)
 		s.freq += 4
-		c.setSummFreq(uint16(scale + 4))
+		m.a.contextSetSummFreq(c, uint16(scale+4))
 		if i == 0 {
 			if 2*n > scale {
 				m.prevSuccess = 1
@@ -759,7 +745,7 @@ func (m *model) decodeSymbol1() (*state, error) {
 			states[i-1], states[i] = states[i], states[i-1]
 			s = &states[i-1]
 		}
-		return m.rescale(s), err
+		return m.rescale(c, s), err
 	}
 
 	for _, s := range states {
@@ -768,8 +754,8 @@ func (m *model) decodeSymbol1() (*state, error) {
 	return nil, m.rc.decode(n, scale)
 }
 
-func (m *model) makeEscFreq(c *context, numMasked int) *see2Context {
-	ns := c.numStates()
+func (m *model) makeEscFreq(c context, numMasked int) *see2Context {
+	ns := m.a.contextNumStates(c)
 	if ns == 256 {
 		return nil
 	}
@@ -779,10 +765,10 @@ func (m *model) makeEscFreq(c *context, numMasked int) *see2Context {
 	if m.prevSym >= 64 {
 		i = 8
 	}
-	if diff < c.suffix().numStates()-ns {
+	if diff < m.a.contextNumStates(m.a.contextSuffix(c))-ns {
 		i++
 	}
-	if int(c.summFreq()) < 11*ns {
+	if int(m.a.contextSummFreq(c)) < 11*ns {
 		i += 2
 	}
 	if numMasked > diff {
@@ -791,22 +777,21 @@ func (m *model) makeEscFreq(c *context, numMasked int) *see2Context {
 	return &m.see2Cont[ns2Index[diff-1]][i]
 }
 
-func (m *model) decodeSymbol2(numMasked int) (*state, error) {
-	c := m.minC
-
+func (m *model) decodeSymbol2(c context, numMasked int) (*state, error) {
 	see := m.makeEscFreq(c, numMasked)
 	scale := see.mean()
 
 	var i int
 	var hi uint32
-	states := c.states()
-	sl := make([]*state, len(states)-numMasked)
+	states := m.a.contextStates(c)
+	n := len(states) - numMasked
+	sl := m.ibuf[:n]
 	for j := range sl {
 		for m.charMask[states[i].sym] == m.escCount {
 			i++
 		}
 		hi += uint32(states[i].freq)
-		sl[j] = &states[i]
+		sl[j] = i
 		i++
 	}
 
@@ -821,18 +806,19 @@ func (m *model) decodeSymbol2(numMasked int) (*state, error) {
 		if see != nil {
 			see.summ += uint16(scale)
 		}
-		for _, s := range sl {
-			m.charMask[s.sym] = m.escCount
+		for _, i := range sl {
+			m.charMask[states[i].sym] = m.escCount
 		}
 		return nil, err
 	}
 
-	hi = uint32(sl[0].freq)
+	hi = uint32(states[sl[0]].freq)
+	n = 0
 	for hi <= count {
-		sl = sl[1:]
-		hi += uint32(sl[0].freq)
+		n++
+		hi += uint32(states[sl[n]].freq)
 	}
-	s := sl[0]
+	s := &states[sl[n]]
 
 	err := m.rc.decode(hi-uint32(s.freq), hi)
 
@@ -842,37 +828,25 @@ func (m *model) decodeSymbol2(numMasked int) (*state, error) {
 	m.runLength = m.initRL
 
 	s.freq += 4
-	c.setSummFreq(c.summFreq() + 4)
-	return m.rescale(s), err
+	m.a.contextIncSummFreq(c, 4)
+	return m.rescale(c, s), err
 }
 
-func (c *context) findState(sym byte) *state {
-	var i int
-	states := c.states()
-	for i = range states {
-		if states[i].sym == sym {
-			break
-		}
-	}
-	return &states[i]
-}
-
-func (m *model) createSuccessors(s, ss *state) *context {
-	var sl []*state
+func (m *model) createSuccessors(c context, s, ss *state) context {
+	sl := m.sbuf[:0]
 
 	if m.orderFall != 0 {
 		sl = append(sl, s)
 	}
 
-	c := m.minC
-	for suff := c.suffix(); suff != nil; suff = c.suffix() {
+	for suff := m.a.contextSuffix(c); suff > 0; suff = m.a.contextSuffix(c) {
 		c = suff
 
 		if ss == nil {
-			ss = c.findState(s.sym)
+			ss = m.a.findState(c, s.sym)
 		}
 		if ss.succ != s.succ {
-			c = m.a.succContext(ss.succ)
+			c = succContext(ss.succ)
 			break
 		}
 		sl = append(sl, ss)
@@ -887,12 +861,12 @@ func (m *model) createSuccessors(s, ss *state) *context {
 	up.sym = m.a.succByte(s.succ)
 	up.succ = m.a.nextByteAddr(s.succ)
 
-	states := c.states()
+	states := m.a.contextStates(c)
 	if len(states) > 1 {
-		s = c.findState(up.sym)
+		s = m.a.findState(c, up.sym)
 
 		cf := uint16(s.freq) - 1
-		s0 := c.summFreq() - uint16(len(states)) - cf
+		s0 := m.a.contextSummFreq(c) - uint16(len(states)) - cf
 
 		if 2*cf <= s0 {
 			if 5*cf > s0 {
@@ -909,20 +883,18 @@ func (m *model) createSuccessors(s, ss *state) *context {
 
 	for i := len(sl) - 1; i >= 0; i-- {
 		c = m.a.newContext(up, c)
-		if c == nil {
-			return nil
+		if c == 0 {
+			return c
 		}
-		sl[i].succ = c.succPtr()
+		sl[i].succ = int32(c)
 	}
 	return c
 }
 
-func (m *model) update(s *state) {
+func (m *model) update(minC, maxC context, s *state) context {
 	if m.orderFall == 0 {
-		if c := m.a.succContext(s.succ); c != nil {
-			m.minC = c
-			m.maxC = c
-			return
+		if s.succ > 0 {
+			return context(s.succ)
 		}
 	}
 
@@ -935,9 +907,9 @@ func (m *model) update(s *state) {
 
 	var ss *state // matching minC.suffix state
 
-	if s.freq < maxFreq/4 && m.minC.suffix() != nil {
-		c := m.minC.suffix()
-		states := c.states()
+	if s.freq < maxFreq/4 && m.a.contextSuffix(minC) > 0 {
+		c := m.a.contextSuffix(minC)
+		states := m.a.contextStates(c)
 
 		var i int
 		if len(states) > 1 {
@@ -950,7 +922,7 @@ func (m *model) update(s *state) {
 			}
 			if states[i].freq < maxFreq-9 {
 				states[i].freq += 2
-				c.setSummFreq(c.summFreq() + 2)
+				m.a.contextIncSummFreq(c, 2)
 			}
 		} else if states[0].freq < 32 {
 			states[0].freq++
@@ -959,57 +931,49 @@ func (m *model) update(s *state) {
 	}
 
 	if m.orderFall == 0 {
-		c := m.createSuccessors(s, ss)
-		if c == nil {
-			m.restart()
-		} else {
-			m.minC = c
-			m.maxC = c
-			s.succ = c.succPtr()
-		}
-		return
+		minC = m.createSuccessors(minC, s, ss)
+		s.succ = int32(minC)
+		return minC
 	}
 
 	succ := m.a.pushByte(s.sym)
-	if m.a.succIsNil(succ) {
-		m.restart()
-		return
+	if succ == 0 {
+		return context(0)
 	}
 
-	var minC *context
-	if m.a.succIsNil(s.succ) {
+	var newC context
+	if s.succ == 0 {
 		s.succ = succ
-		minC = m.minC
+		newC = minC
 	} else {
-		minC = m.a.succContext(s.succ)
-		if minC == nil {
-			minC = m.createSuccessors(s, ss)
-			if minC == nil {
-				m.restart()
-				return
+		if s.succ > 0 {
+			newC = context(s.succ)
+		} else {
+			newC = m.createSuccessors(minC, s, ss)
+			if newC == 0 {
+				return context(0)
 			}
 		}
 		m.orderFall--
 		if m.orderFall == 0 {
-			succ = minC.succPtr()
-			if m.maxC.notEq(m.minC) {
+			succ = int32(newC)
+			if maxC != minC {
 				m.a.popByte()
 			}
 		}
 	}
 
-	n := m.minC.numStates()
-	s0 := int(m.minC.summFreq()) - n - int(s.freq-1)
-	for c := m.maxC; c.notEq(m.minC); c = c.suffix() {
+	n := m.a.contextNumStates(minC)
+	s0 := int(m.a.contextSummFreq(minC)) - n - int(s.freq-1)
+	for c := maxC; c != minC; c = m.a.contextSuffix(c) {
 		var summFreq uint16
 
-		states := c.expandStates()
+		states := m.a.expandStates(c)
 		if states == nil {
-			m.restart()
-			return
+			return context(0)
 		}
 		if ns := len(states) - 1; ns != 1 {
-			summFreq = c.summFreq()
+			summFreq = m.a.contextSummFreq(c)
 			if 4*ns <= n && int(summFreq) <= 8*ns {
 				summFreq += 2
 			}
@@ -1056,41 +1020,40 @@ func (m *model) update(s *state) {
 			summFreq += 3
 		}
 		states[len(states)-1] = state{sym: s.sym, freq: freq, succ: succ}
-		c.setSummFreq(summFreq)
+		m.a.contextSetSummFreq(c, summFreq)
 	}
-	m.minC = minC
-	m.maxC = minC
+	return newC
 }
 
 func (m *model) ReadByte() (byte, error) {
-	if m.minC == nil {
-		return 0, errCorruptPPM
+	if m.c == 0 {
+		m.restart()
 	}
+	minC := m.c
+	maxC := minC
 	var s *state
 	var err error
-	if m.minC.numStates() == 1 {
-		s, err = m.decodeBinSymbol()
+	if m.a.contextNumStates(minC) == 1 {
+		s, err = m.decodeBinSymbol(minC)
 	} else {
-		s, err = m.decodeSymbol1()
+		s, err = m.decodeSymbol1(minC)
 	}
 	for s == nil && err == nil {
-		n := m.minC.numStates()
-		for m.minC.numStates() == n {
+		n := m.a.contextNumStates(minC)
+		for m.a.contextNumStates(minC) == n {
 			m.orderFall++
-			m.minC = m.minC.suffix()
-			if m.minC == nil {
+			minC = m.a.contextSuffix(minC)
+			if minC <= 0 {
 				return 0, errCorruptPPM
 			}
 		}
-		s, err = m.decodeSymbol2(n)
+		s, err = m.decodeSymbol2(minC, n)
 	}
 	if err != nil {
 		return 0, err
 	}
 
-	// save sym so it doesn't get overwritten by a possible restart()
-	sym := s.sym
-	m.update(s)
-	m.prevSym = sym
-	return sym, nil
+	m.c = m.update(minC, maxC, s)
+	m.prevSym = s.sym
+	return s.sym, nil
 }
diff --git a/vendor/github.com/pierrec/lz4/lz4.go b/vendor/github.com/pierrec/lz4/lz4.go
index 6c73539..a3284bd 100644
--- a/vendor/github.com/pierrec/lz4/lz4.go
+++ b/vendor/github.com/pierrec/lz4/lz4.go
@@ -10,9 +10,10 @@
 //
 package lz4
 
-import "math/bits"
-
-import "sync"
+import (
+	"math/bits"
+	"sync"
+)
 
 const (
 	// Extension is the LZ4 frame file name extension
@@ -20,8 +21,9 @@ const (
 	// Version is the LZ4 frame format version
 	Version = 1
 
-	frameMagic     uint32 = 0x184D2204
-	frameSkipMagic uint32 = 0x184D2A50
+	frameMagic       uint32 = 0x184D2204
+	frameSkipMagic   uint32 = 0x184D2A50
+	frameMagicLegacy uint32 = 0x184C2102
 
 	// The following constants are used to setup the compression algorithm.
 	minMatch            = 4  // the minimum size of the match sequence size (4 bytes)
@@ -108,6 +110,7 @@ type Header struct {
 	done             bool   // Header processed flag (Read or Write and checked).
 }
 
+// Reset reset internal status
 func (h *Header) Reset() {
 	h.done = false
 }
diff --git a/vendor/github.com/pierrec/lz4/reader_legacy.go b/vendor/github.com/pierrec/lz4/reader_legacy.go
new file mode 100644
index 0000000..1670a77
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/reader_legacy.go
@@ -0,0 +1,207 @@
+package lz4
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+)
+
+// ReaderLegacy implements the LZ4Demo frame decoder.
+// The Header is set after the first call to Read().
+type ReaderLegacy struct {
+	Header
+	// Handler called when a block has been successfully read.
+	// It provides the number of bytes read.
+	OnBlockDone func(size int)
+
+	lastBlock bool
+	buf       [8]byte   // Scrap buffer.
+	pos       int64     // Current position in src.
+	src       io.Reader // Source.
+	zdata     []byte    // Compressed data.
+	data      []byte    // Uncompressed data.
+	idx       int       // Index of unread bytes into data.
+	skip      int64     // Bytes to skip before next read.
+	dpos      int64     // Position in dest
+}
+
+// NewReaderLegacy returns a new LZ4Demo frame decoder.
+// No access to the underlying io.Reader is performed.
+func NewReaderLegacy(src io.Reader) *ReaderLegacy {
+	r := &ReaderLegacy{src: src}
+	return r
+}
+
+// readHeader checks the frame magic number and parses the frame descriptoz.
+// Skippable frames are supported even as a first frame although the LZ4
+// specifications recommends skippable frames not to be used as first frames.
+func (z *ReaderLegacy) readLegacyHeader() error {
+	z.lastBlock = false
+	magic, err := z.readUint32()
+	if err != nil {
+		z.pos += 4
+		if err == io.ErrUnexpectedEOF {
+			return io.EOF
+		}
+		return err
+	}
+	if magic != frameMagicLegacy {
+		return ErrInvalid
+	}
+	z.pos += 4
+
+	// Legacy has fixed 8MB blocksizes
+	// https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md#legacy-frame
+	bSize := blockSize4M * 2
+
+	// Allocate the compressed/uncompressed buffers.
+	// The compressed buffer cannot exceed the uncompressed one.
+	if n := 2 * bSize; cap(z.zdata) < n {
+		z.zdata = make([]byte, n, n)
+	}
+	if debugFlag {
+		debug("header block max size size=%d", bSize)
+	}
+	z.zdata = z.zdata[:bSize]
+	z.data = z.zdata[:cap(z.zdata)][bSize:]
+	z.idx = len(z.data)
+
+	z.Header.done = true
+	if debugFlag {
+		debug("header read: %v", z.Header)
+	}
+
+	return nil
+}
+
+// Read decompresses data from the underlying source into the supplied buffer.
+//
+// Since there can be multiple streams concatenated, Header values may
+// change between calls to Read(). If that is the case, no data is actually read from
+// the underlying io.Reader, to allow for potential input buffer resizing.
+func (z *ReaderLegacy) Read(buf []byte) (int, error) {
+	if debugFlag {
+		debug("Read buf len=%d", len(buf))
+	}
+	if !z.Header.done {
+		if err := z.readLegacyHeader(); err != nil {
+			return 0, err
+		}
+		if debugFlag {
+			debug("header read OK compressed buffer %d / %d uncompressed buffer %d : %d index=%d",
+				len(z.zdata), cap(z.zdata), len(z.data), cap(z.data), z.idx)
+		}
+	}
+
+	if len(buf) == 0 {
+		return 0, nil
+	}
+
+	if z.idx == len(z.data) {
+		// No data ready for reading, process the next block.
+		if debugFlag {
+			debug("  reading block from writer %d %d", z.idx, blockSize4M*2)
+		}
+
+		// Reset uncompressed buffer
+		z.data = z.zdata[:cap(z.zdata)][len(z.zdata):]
+
+		bLen, err := z.readUint32()
+		if err != nil {
+			return 0, err
+		}
+		if debugFlag {
+			debug("   bLen %d (0x%x) offset = %d (0x%x)", bLen, bLen, z.pos, z.pos)
+		}
+		z.pos += 4
+
+		// Legacy blocks are always compressed, even when detrimental
+		if debugFlag {
+			debug("   compressed block size %d", bLen)
+		}
+
+		if int(bLen) > cap(z.data) {
+			return 0, fmt.Errorf("lz4: invalid block size: %d", bLen)
+		}
+		zdata := z.zdata[:bLen]
+		if _, err := io.ReadFull(z.src, zdata); err != nil {
+			return 0, err
+		}
+		z.pos += int64(bLen)
+
+		n, err := UncompressBlock(zdata, z.data)
+		if err != nil {
+			return 0, err
+		}
+
+		z.data = z.data[:n]
+		if z.OnBlockDone != nil {
+			z.OnBlockDone(n)
+		}
+
+		z.idx = 0
+
+		// Legacy blocks are fixed to 8MB, if we read a decompressed block smaller than this
+		// it means we've reached the end...
+		if n < blockSize4M*2 {
+			z.lastBlock = true
+		}
+	}
+
+	if z.skip > int64(len(z.data[z.idx:])) {
+		z.skip -= int64(len(z.data[z.idx:]))
+		z.dpos += int64(len(z.data[z.idx:]))
+		z.idx = len(z.data)
+		return 0, nil
+	}
+
+	z.idx += int(z.skip)
+	z.dpos += z.skip
+	z.skip = 0
+
+	n := copy(buf, z.data[z.idx:])
+	z.idx += n
+	z.dpos += int64(n)
+	if debugFlag {
+		debug("%v] copied %d bytes to input (%d:%d)", z.lastBlock, n, z.idx, len(z.data))
+	}
+	if z.lastBlock && len(z.data) == z.idx {
+		return n, io.EOF
+	}
+	return n, nil
+}
+
+// Seek implements io.Seeker, but supports seeking forward from the current
+// position only. Any other seek will return an error. Allows skipping output
+// bytes which aren't needed, which in some scenarios is faster than reading
+// and discarding them.
+// Note this may cause future calls to Read() to read 0 bytes if all of the
+// data they would have returned is skipped.
+func (z *ReaderLegacy) Seek(offset int64, whence int) (int64, error) {
+	if offset < 0 || whence != io.SeekCurrent {
+		return z.dpos + z.skip, ErrUnsupportedSeek
+	}
+	z.skip += offset
+	return z.dpos + z.skip, nil
+}
+
+// Reset discards the Reader's state and makes it equivalent to the
+// result of its original state from NewReader, but reading from r instead.
+// This permits reusing a Reader rather than allocating a new one.
+func (z *ReaderLegacy) Reset(r io.Reader) {
+	z.Header = Header{}
+	z.pos = 0
+	z.src = r
+	z.zdata = z.zdata[:0]
+	z.data = z.data[:0]
+	z.idx = 0
+}
+
+// readUint32 reads an uint32 into the supplied buffer.
+// The idea is to make use of the already allocated buffers avoiding additional allocations.
+func (z *ReaderLegacy) readUint32() (uint32, error) {
+	buf := z.buf[:4]
+	_, err := io.ReadFull(z.src, buf)
+	x := binary.LittleEndian.Uint32(buf)
+	return x, err
+}
diff --git a/vendor/github.com/pierrec/lz4/writer.go b/vendor/github.com/pierrec/lz4/writer.go
index 324f138..f066d56 100644
--- a/vendor/github.com/pierrec/lz4/writer.go
+++ b/vendor/github.com/pierrec/lz4/writer.go
@@ -3,9 +3,10 @@ package lz4
 import (
 	"encoding/binary"
 	"fmt"
-	"github.com/pierrec/lz4/internal/xxh32"
 	"io"
 	"runtime"
+
+	"github.com/pierrec/lz4/internal/xxh32"
 )
 
 // zResult contains the results of compressing a block.
@@ -83,10 +84,8 @@ func (z *Writer) WithConcurrency(n int) *Writer {
 					z.err = err
 				}
 			}
-			if isCompressed := res.size&compressedBlockFlag == 0; isCompressed {
-				// It is now safe to release the buffer as no longer in use by any goroutine.
-				putBuffer(cap(res.data), res.data)
-			}
+			// It is now safe to release the buffer as no longer in use by any goroutine.
+			putBuffer(cap(res.data), res.data)
 			if h := z.OnBlockDone; h != nil {
 				h(n)
 			}
@@ -230,7 +229,12 @@ func (z *Writer) compressBlock(data []byte) error {
 	if z.c != nil {
 		c := make(chan zResult)
 		z.c <- c // Send now to guarantee order
-		go writerCompressBlock(c, z.Header, data)
+
+		// get a buffer from the pool and copy the data over
+		block := getBuffer(z.Header.BlockMaxSize)[:len(data)]
+		copy(block, data)
+
+		go writerCompressBlock(c, z.Header, block)
 		return nil
 	}
 
@@ -298,7 +302,9 @@ func (z *Writer) Flush() error {
 		return nil
 	}
 
-	data := z.data[:z.idx]
+	data := getBuffer(z.Header.BlockMaxSize)[:len(z.data[:z.idx])]
+	copy(data, z.data[:z.idx])
+
 	z.idx = 0
 	if z.c == nil {
 		return z.compressBlock(data)
@@ -370,6 +376,10 @@ func (z *Writer) Reset(w io.Writer) {
 	z.checksum.Reset()
 	z.idx = 0
 	z.err = nil
+	// reset hashtable to ensure deterministic output.
+	for i := range z.hashtable {
+		z.hashtable[i] = 0
+	}
 	z.WithConcurrency(n)
 }
 
@@ -397,9 +407,13 @@ func writerCompressBlock(c chan zResult, header Header, data []byte) {
 	if zn > 0 && zn < len(data) {
 		res.size = uint32(zn)
 		res.data = zdata[:zn]
+		// release the uncompressed block since it is not used anymore
+		putBuffer(header.BlockMaxSize, data)
 	} else {
 		res.size = uint32(len(data)) | compressedBlockFlag
 		res.data = data
+		// release the compressed block since it was not used
+		putBuffer(header.BlockMaxSize, zdata)
 	}
 	if header.BlockChecksum {
 		res.checksum = xxh32.ChecksumZero(res.data)
diff --git a/vendor/github.com/pierrec/lz4/writer_legacy.go b/vendor/github.com/pierrec/lz4/writer_legacy.go
new file mode 100644
index 0000000..ca8dc8c
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/writer_legacy.go
@@ -0,0 +1,182 @@
+package lz4
+
+import (
+	"encoding/binary"
+	"io"
+)
+
+// WriterLegacy implements the LZ4Demo frame decoder.
+type WriterLegacy struct {
+	Header
+	// Handler called when a block has been successfully read.
+	// It provides the number of bytes read.
+	OnBlockDone func(size int)
+
+	dst       io.Writer    // Destination.
+	data      []byte       // Data to be compressed + buffer for compressed data.
+	idx       int          // Index into data.
+	hashtable [winSize]int // Hash table used in CompressBlock().
+}
+
+// NewWriterLegacy returns a new LZ4 encoder for the legacy frame format.
+// No access to the underlying io.Writer is performed.
+// The supplied Header is checked at the first Write.
+// It is ok to change it before the first Write but then not until a Reset() is performed.
+func NewWriterLegacy(dst io.Writer) *WriterLegacy {
+	z := new(WriterLegacy)
+	z.Reset(dst)
+	return z
+}
+
+// Write compresses data from the supplied buffer into the underlying io.Writer.
+// Write does not return until the data has been written.
+func (z *WriterLegacy) Write(buf []byte) (int, error) {
+	if !z.Header.done {
+		if err := z.writeHeader(); err != nil {
+			return 0, err
+		}
+	}
+	if debugFlag {
+		debug("input buffer len=%d index=%d", len(buf), z.idx)
+	}
+
+	zn := len(z.data)
+	var n int
+	for len(buf) > 0 {
+		if z.idx == 0 && len(buf) >= zn {
+			// Avoid a copy as there is enough data for a block.
+			if err := z.compressBlock(buf[:zn]); err != nil {
+				return n, err
+			}
+			n += zn
+			buf = buf[zn:]
+			continue
+		}
+		// Accumulate the data to be compressed.
+		m := copy(z.data[z.idx:], buf)
+		n += m
+		z.idx += m
+		buf = buf[m:]
+		if debugFlag {
+			debug("%d bytes copied to buf, current index %d", n, z.idx)
+		}
+
+		if z.idx < len(z.data) {
+			// Buffer not filled.
+			if debugFlag {
+				debug("need more data for compression")
+			}
+			return n, nil
+		}
+
+		// Buffer full.
+		if err := z.compressBlock(z.data); err != nil {
+			return n, err
+		}
+		z.idx = 0
+	}
+
+	return n, nil
+}
+
+// writeHeader builds and writes the header to the underlying io.Writer.
+func (z *WriterLegacy) writeHeader() error {
+	// Legacy has fixed 8MB blocksizes
+	// https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md#legacy-frame
+	bSize := 2 * blockSize4M
+
+	buf := make([]byte, 2*bSize, 2*bSize)
+	z.data = buf[:bSize] // Uncompressed buffer is the first half.
+
+	z.idx = 0
+
+	// Header consists of one mageic number, write it out.
+	if err := binary.Write(z.dst, binary.LittleEndian, frameMagicLegacy); err != nil {
+		return err
+	}
+	z.Header.done = true
+	if debugFlag {
+		debug("wrote header %v", z.Header)
+	}
+
+	return nil
+}
+
+// compressBlock compresses a block.
+func (z *WriterLegacy) compressBlock(data []byte) error {
+	bSize := 2 * blockSize4M
+	zdata := z.data[bSize:cap(z.data)]
+	// The compressed block size cannot exceed the input's.
+	var zn int
+
+	if level := z.Header.CompressionLevel; level != 0 {
+		zn, _ = CompressBlockHC(data, zdata, level)
+	} else {
+		zn, _ = CompressBlock(data, zdata, z.hashtable[:])
+	}
+
+	if debugFlag {
+		debug("block compression %d => %d", len(data), zn)
+	}
+	zdata = zdata[:zn]
+
+	// Write the block.
+	if err := binary.Write(z.dst, binary.LittleEndian, uint32(zn)); err != nil {
+		return err
+	}
+	written, err := z.dst.Write(zdata)
+	if err != nil {
+		return err
+	}
+	if h := z.OnBlockDone; h != nil {
+		h(written)
+	}
+	return nil
+}
+
+// Flush flushes any pending compressed data to the underlying writer.
+// Flush does not return until the data has been written.
+// If the underlying writer returns an error, Flush returns that error.
+func (z *WriterLegacy) Flush() error {
+	if debugFlag {
+		debug("flush with index %d", z.idx)
+	}
+	if z.idx == 0 {
+		return nil
+	}
+
+	data := z.data[:z.idx]
+	z.idx = 0
+	return z.compressBlock(data)
+}
+
+// Close closes the WriterLegacy, flushing any unwritten data to the underlying io.Writer, but does not close the underlying io.Writer.
+func (z *WriterLegacy) Close() error {
+	if !z.Header.done {
+		if err := z.writeHeader(); err != nil {
+			return err
+		}
+	}
+	if err := z.Flush(); err != nil {
+		return err
+	}
+
+	if debugFlag {
+		debug("writing last empty block")
+	}
+
+	return nil
+}
+
+// Reset clears the state of the WriterLegacy z such that it is equivalent to its
+// initial state from NewWriterLegacy, but instead writing to w.
+// No access to the underlying io.Writer is performed.
+func (z *WriterLegacy) Reset(w io.Writer) {
+	z.Header.Reset()
+	z.dst = w
+	z.idx = 0
+	// reset hashtable to ensure deterministic output.
+	for i := range z.hashtable {
+		z.hashtable[i] = 0
+	}
+}
diff --git a/vendor/github.com/stretchr/testify/assert/assertion_compare.go b/vendor/github.com/stretchr/testify/assert/assertion_compare.go
index dc20039..41649d2 100644
--- a/vendor/github.com/stretchr/testify/assert/assertion_compare.go
+++ b/vendor/github.com/stretchr/testify/assert/assertion_compare.go
@@ -13,12 +13,42 @@ const (
 	compareGreater
 )
 
+var (
+	intType   = reflect.TypeOf(int(1))
+	int8Type  = reflect.TypeOf(int8(1))
+	int16Type = reflect.TypeOf(int16(1))
+	int32Type = reflect.TypeOf(int32(1))
+	int64Type = reflect.TypeOf(int64(1))
+
+	uintType   = reflect.TypeOf(uint(1))
+	uint8Type  = reflect.TypeOf(uint8(1))
+	uint16Type = reflect.TypeOf(uint16(1))
+	uint32Type = reflect.TypeOf(uint32(1))
+	uint64Type = reflect.TypeOf(uint64(1))
+
+	float32Type = reflect.TypeOf(float32(1))
+	float64Type = reflect.TypeOf(float64(1))
+
+	stringType = reflect.TypeOf("")
+)
+
 func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
+	obj1Value := reflect.ValueOf(obj1)
+	obj2Value := reflect.ValueOf(obj2)
+
+	// throughout this switch we try and avoid calling .Convert() if possible,
+	// as this has a pretty big performance impact
 	switch kind {
 	case reflect.Int:
 		{
-			intobj1 := obj1.(int)
-			intobj2 := obj2.(int)
+			intobj1, ok := obj1.(int)
+			if !ok {
+				intobj1 = obj1Value.Convert(intType).Interface().(int)
+			}
+			intobj2, ok := obj2.(int)
+			if !ok {
+				intobj2 = obj2Value.Convert(intType).Interface().(int)
+			}
 			if intobj1 > intobj2 {
 				return compareGreater, true
 			}
@@ -31,8 +61,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Int8:
 		{
-			int8obj1 := obj1.(int8)
-			int8obj2 := obj2.(int8)
+			int8obj1, ok := obj1.(int8)
+			if !ok {
+				int8obj1 = obj1Value.Convert(int8Type).Interface().(int8)
+			}
+			int8obj2, ok := obj2.(int8)
+			if !ok {
+				int8obj2 = obj2Value.Convert(int8Type).Interface().(int8)
+			}
 			if int8obj1 > int8obj2 {
 				return compareGreater, true
 			}
@@ -45,8 +81,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Int16:
 		{
-			int16obj1 := obj1.(int16)
-			int16obj2 := obj2.(int16)
+			int16obj1, ok := obj1.(int16)
+			if !ok {
+				int16obj1 = obj1Value.Convert(int16Type).Interface().(int16)
+			}
+			int16obj2, ok := obj2.(int16)
+			if !ok {
+				int16obj2 = obj2Value.Convert(int16Type).Interface().(int16)
+			}
 			if int16obj1 > int16obj2 {
 				return compareGreater, true
 			}
@@ -59,8 +101,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Int32:
 		{
-			int32obj1 := obj1.(int32)
-			int32obj2 := obj2.(int32)
+			int32obj1, ok := obj1.(int32)
+			if !ok {
+				int32obj1 = obj1Value.Convert(int32Type).Interface().(int32)
+			}
+			int32obj2, ok := obj2.(int32)
+			if !ok {
+				int32obj2 = obj2Value.Convert(int32Type).Interface().(int32)
+			}
 			if int32obj1 > int32obj2 {
 				return compareGreater, true
 			}
@@ -73,8 +121,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Int64:
 		{
-			int64obj1 := obj1.(int64)
-			int64obj2 := obj2.(int64)
+			int64obj1, ok := obj1.(int64)
+			if !ok {
+				int64obj1 = obj1Value.Convert(int64Type).Interface().(int64)
+			}
+			int64obj2, ok := obj2.(int64)
+			if !ok {
+				int64obj2 = obj2Value.Convert(int64Type).Interface().(int64)
+			}
 			if int64obj1 > int64obj2 {
 				return compareGreater, true
 			}
@@ -87,8 +141,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Uint:
 		{
-			uintobj1 := obj1.(uint)
-			uintobj2 := obj2.(uint)
+			uintobj1, ok := obj1.(uint)
+			if !ok {
+				uintobj1 = obj1Value.Convert(uintType).Interface().(uint)
+			}
+			uintobj2, ok := obj2.(uint)
+			if !ok {
+				uintobj2 = obj2Value.Convert(uintType).Interface().(uint)
+			}
 			if uintobj1 > uintobj2 {
 				return compareGreater, true
 			}
@@ -101,8 +161,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Uint8:
 		{
-			uint8obj1 := obj1.(uint8)
-			uint8obj2 := obj2.(uint8)
+			uint8obj1, ok := obj1.(uint8)
+			if !ok {
+				uint8obj1 = obj1Value.Convert(uint8Type).Interface().(uint8)
+			}
+			uint8obj2, ok := obj2.(uint8)
+			if !ok {
+				uint8obj2 = obj2Value.Convert(uint8Type).Interface().(uint8)
+			}
 			if uint8obj1 > uint8obj2 {
 				return compareGreater, true
 			}
@@ -115,8 +181,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Uint16:
 		{
-			uint16obj1 := obj1.(uint16)
-			uint16obj2 := obj2.(uint16)
+			uint16obj1, ok := obj1.(uint16)
+			if !ok {
+				uint16obj1 = obj1Value.Convert(uint16Type).Interface().(uint16)
+			}
+			uint16obj2, ok := obj2.(uint16)
+			if !ok {
+				uint16obj2 = obj2Value.Convert(uint16Type).Interface().(uint16)
+			}
 			if uint16obj1 > uint16obj2 {
 				return compareGreater, true
 			}
@@ -129,8 +201,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Uint32:
 		{
-			uint32obj1 := obj1.(uint32)
-			uint32obj2 := obj2.(uint32)
+			uint32obj1, ok := obj1.(uint32)
+			if !ok {
+				uint32obj1 = obj1Value.Convert(uint32Type).Interface().(uint32)
+			}
+			uint32obj2, ok := obj2.(uint32)
+			if !ok {
+				uint32obj2 = obj2Value.Convert(uint32Type).Interface().(uint32)
+			}
 			if uint32obj1 > uint32obj2 {
 				return compareGreater, true
 			}
@@ -143,8 +221,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Uint64:
 		{
-			uint64obj1 := obj1.(uint64)
-			uint64obj2 := obj2.(uint64)
+			uint64obj1, ok := obj1.(uint64)
+			if !ok {
+				uint64obj1 = obj1Value.Convert(uint64Type).Interface().(uint64)
+			}
+			uint64obj2, ok := obj2.(uint64)
+			if !ok {
+				uint64obj2 = obj2Value.Convert(uint64Type).Interface().(uint64)
+			}
 			if uint64obj1 > uint64obj2 {
 				return compareGreater, true
 			}
@@ -157,8 +241,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Float32:
 		{
-			float32obj1 := obj1.(float32)
-			float32obj2 := obj2.(float32)
+			float32obj1, ok := obj1.(float32)
+			if !ok {
+				float32obj1 = obj1Value.Convert(float32Type).Interface().(float32)
+			}
+			float32obj2, ok := obj2.(float32)
+			if !ok {
+				float32obj2 = obj2Value.Convert(float32Type).Interface().(float32)
+			}
 			if float32obj1 > float32obj2 {
 				return compareGreater, true
 			}
@@ -171,8 +261,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.Float64:
 		{
-			float64obj1 := obj1.(float64)
-			float64obj2 := obj2.(float64)
+			float64obj1, ok := obj1.(float64)
+			if !ok {
+				float64obj1 = obj1Value.Convert(float64Type).Interface().(float64)
+			}
+			float64obj2, ok := obj2.(float64)
+			if !ok {
+				float64obj2 = obj2Value.Convert(float64Type).Interface().(float64)
+			}
 			if float64obj1 > float64obj2 {
 				return compareGreater, true
 			}
@@ -185,8 +281,14 @@ func compare(obj1, obj2 interface{}, kind reflect.Kind) (CompareType, bool) {
 		}
 	case reflect.String:
 		{
-			stringobj1 := obj1.(string)
-			stringobj2 := obj2.(string)
+			stringobj1, ok := obj1.(string)
+			if !ok {
+				stringobj1 = obj1Value.Convert(stringType).Interface().(string)
+			}
+			stringobj2, ok := obj2.(string)
+			if !ok {
+				stringobj2 = obj2Value.Convert(stringType).Interface().(string)
+			}
 			if stringobj1 > stringobj2 {
 				return compareGreater, true
 			}
@@ -240,6 +342,24 @@ func LessOrEqual(t TestingT, e1 interface{}, e2 interface{}, msgAndArgs ...inter
 	return compareTwoValues(t, e1, e2, []CompareType{compareLess, compareEqual}, "\"%v\" is not less than or equal to \"%v\"", msgAndArgs)
 }
 
+// Positive asserts that the specified element is positive
+//
+//    assert.Positive(t, 1)
+//    assert.Positive(t, 1.23)
+func Positive(t TestingT, e interface{}, msgAndArgs ...interface{}) bool {
+	zero := reflect.Zero(reflect.TypeOf(e))
+	return compareTwoValues(t, e, zero.Interface(), []CompareType{compareGreater}, "\"%v\" is not positive", msgAndArgs)
+}
+
+// Negative asserts that the specified element is negative
+//
+//    assert.Negative(t, -1)
+//    assert.Negative(t, -1.23)
+func Negative(t TestingT, e interface{}, msgAndArgs ...interface{}) bool {
+	zero := reflect.Zero(reflect.TypeOf(e))
+	return compareTwoValues(t, e, zero.Interface(), []CompareType{compareLess}, "\"%v\" is not negative", msgAndArgs)
+}
+
 func compareTwoValues(t TestingT, e1 interface{}, e2 interface{}, allowedComparesResults []CompareType, failMessage string, msgAndArgs ...interface{}) bool {
 	if h, ok := t.(tHelper); ok {
 		h.Helper()
diff --git a/vendor/github.com/stretchr/testify/assert/assertion_format.go b/vendor/github.com/stretchr/testify/assert/assertion_format.go
index 49370eb..4dfd122 100644
--- a/vendor/github.com/stretchr/testify/assert/assertion_format.go
+++ b/vendor/github.com/stretchr/testify/assert/assertion_format.go
@@ -114,6 +114,24 @@ func Errorf(t TestingT, err error, msg string, args ...interface{}) bool {
 	return Error(t, err, append([]interface{}{msg}, args...)...)
 }
 
+// ErrorAsf asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func ErrorAsf(t TestingT, err error, target interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return ErrorAs(t, err, target, append([]interface{}{msg}, args...)...)
+}
+
+// ErrorIsf asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func ErrorIsf(t TestingT, err error, target error, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return ErrorIs(t, err, target, append([]interface{}{msg}, args...)...)
+}
+
 // Eventuallyf asserts that given condition will be met in waitFor time,
 // periodically checking target function each tick.
 //
@@ -321,6 +339,54 @@ func InEpsilonSlicef(t TestingT, expected interface{}, actual interface{}, epsil
 	return InEpsilonSlice(t, expected, actual, epsilon, append([]interface{}{msg}, args...)...)
 }
 
+// IsDecreasingf asserts that the collection is decreasing
+//
+//    assert.IsDecreasingf(t, []int{2, 1, 0}, "error message %s", "formatted")
+//    assert.IsDecreasingf(t, []float{2, 1}, "error message %s", "formatted")
+//    assert.IsDecreasingf(t, []string{"b", "a"}, "error message %s", "formatted")
+func IsDecreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsDecreasing(t, object, append([]interface{}{msg}, args...)...)
+}
+
+// IsIncreasingf asserts that the collection is increasing
+//
+//    assert.IsIncreasingf(t, []int{1, 2, 3}, "error message %s", "formatted")
+//    assert.IsIncreasingf(t, []float{1, 2}, "error message %s", "formatted")
+//    assert.IsIncreasingf(t, []string{"a", "b"}, "error message %s", "formatted")
+func IsIncreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsIncreasing(t, object, append([]interface{}{msg}, args...)...)
+}
+
+// IsNonDecreasingf asserts that the collection is not decreasing
+//
+//    assert.IsNonDecreasingf(t, []int{1, 1, 2}, "error message %s", "formatted")
+//    assert.IsNonDecreasingf(t, []float{1, 2}, "error message %s", "formatted")
+//    assert.IsNonDecreasingf(t, []string{"a", "b"}, "error message %s", "formatted")
+func IsNonDecreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsNonDecreasing(t, object, append([]interface{}{msg}, args...)...)
+}
+
+// IsNonIncreasingf asserts that the collection is not increasing
+//
+//    assert.IsNonIncreasingf(t, []int{2, 1, 1}, "error message %s", "formatted")
+//    assert.IsNonIncreasingf(t, []float{2, 1}, "error message %s", "formatted")
+//    assert.IsNonIncreasingf(t, []string{"b", "a"}, "error message %s", "formatted")
+func IsNonIncreasingf(t TestingT, object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsNonIncreasing(t, object, append([]interface{}{msg}, args...)...)
+}
+
 // IsTypef asserts that the specified objects are of the same type.
 func IsTypef(t TestingT, expectedType interface{}, object interface{}, msg string, args ...interface{}) bool {
 	if h, ok := t.(tHelper); ok {
@@ -375,6 +441,17 @@ func LessOrEqualf(t TestingT, e1 interface{}, e2 interface{}, msg string, args .
 	return LessOrEqual(t, e1, e2, append([]interface{}{msg}, args...)...)
 }
 
+// Negativef asserts that the specified element is negative
+//
+//    assert.Negativef(t, -1, "error message %s", "formatted")
+//    assert.Negativef(t, -1.23, "error message %s", "formatted")
+func Negativef(t TestingT, e interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return Negative(t, e, append([]interface{}{msg}, args...)...)
+}
+
 // Neverf asserts that the given condition doesn't satisfy in waitFor time,
 // periodically checking the target function each tick.
 //
@@ -476,6 +553,15 @@ func NotEqualValuesf(t TestingT, expected interface{}, actual interface{}, msg s
 	return NotEqualValues(t, expected, actual, append([]interface{}{msg}, args...)...)
 }
 
+// NotErrorIsf asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func NotErrorIsf(t TestingT, err error, target error, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return NotErrorIs(t, err, target, append([]interface{}{msg}, args...)...)
+}
+
 // NotNilf asserts that the specified object is not nil.
 //
 //    assert.NotNilf(t, err, "error message %s", "formatted")
@@ -572,6 +658,17 @@ func PanicsWithValuef(t TestingT, expected interface{}, f PanicTestFunc, msg str
 	return PanicsWithValue(t, expected, f, append([]interface{}{msg}, args...)...)
 }
 
+// Positivef asserts that the specified element is positive
+//
+//    assert.Positivef(t, 1, "error message %s", "formatted")
+//    assert.Positivef(t, 1.23, "error message %s", "formatted")
+func Positivef(t TestingT, e interface{}, msg string, args ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	return Positive(t, e, append([]interface{}{msg}, args...)...)
+}
+
 // Regexpf asserts that a specified regexp matches a string.
 //
 //  assert.Regexpf(t, regexp.MustCompile("start"), "it's starting", "error message %s", "formatted")
diff --git a/vendor/github.com/stretchr/testify/assert/assertion_forward.go b/vendor/github.com/stretchr/testify/assert/assertion_forward.go
index 9db8894..25337a6 100644
--- a/vendor/github.com/stretchr/testify/assert/assertion_forward.go
+++ b/vendor/github.com/stretchr/testify/assert/assertion_forward.go
@@ -204,6 +204,42 @@ func (a *Assertions) Error(err error, msgAndArgs ...interface{}) bool {
 	return Error(a.t, err, msgAndArgs...)
 }
 
+// ErrorAs asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func (a *Assertions) ErrorAs(err error, target interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return ErrorAs(a.t, err, target, msgAndArgs...)
+}
+
+// ErrorAsf asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func (a *Assertions) ErrorAsf(err error, target interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return ErrorAsf(a.t, err, target, msg, args...)
+}
+
+// ErrorIs asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) ErrorIs(err error, target error, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return ErrorIs(a.t, err, target, msgAndArgs...)
+}
+
+// ErrorIsf asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) ErrorIsf(err error, target error, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return ErrorIsf(a.t, err, target, msg, args...)
+}
+
 // Errorf asserts that a function returned an error (i.e. not `nil`).
 //
 //   actualObj, err := SomeFunction()
@@ -631,6 +667,102 @@ func (a *Assertions) InEpsilonf(expected interface{}, actual interface{}, epsilo
 	return InEpsilonf(a.t, expected, actual, epsilon, msg, args...)
 }
 
+// IsDecreasing asserts that the collection is decreasing
+//
+//    a.IsDecreasing([]int{2, 1, 0})
+//    a.IsDecreasing([]float{2, 1})
+//    a.IsDecreasing([]string{"b", "a"})
+func (a *Assertions) IsDecreasing(object interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsDecreasing(a.t, object, msgAndArgs...)
+}
+
+// IsDecreasingf asserts that the collection is decreasing
+//
+//    a.IsDecreasingf([]int{2, 1, 0}, "error message %s", "formatted")
+//    a.IsDecreasingf([]float{2, 1}, "error message %s", "formatted")
+//    a.IsDecreasingf([]string{"b", "a"}, "error message %s", "formatted")
+func (a *Assertions) IsDecreasingf(object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsDecreasingf(a.t, object, msg, args...)
+}
+
+// IsIncreasing asserts that the collection is increasing
+//
+//    a.IsIncreasing([]int{1, 2, 3})
+//    a.IsIncreasing([]float{1, 2})
+//    a.IsIncreasing([]string{"a", "b"})
+func (a *Assertions) IsIncreasing(object interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsIncreasing(a.t, object, msgAndArgs...)
+}
+
+// IsIncreasingf asserts that the collection is increasing
+//
+//    a.IsIncreasingf([]int{1, 2, 3}, "error message %s", "formatted")
+//    a.IsIncreasingf([]float{1, 2}, "error message %s", "formatted")
+//    a.IsIncreasingf([]string{"a", "b"}, "error message %s", "formatted")
+func (a *Assertions) IsIncreasingf(object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsIncreasingf(a.t, object, msg, args...)
+}
+
+// IsNonDecreasing asserts that the collection is not decreasing
+//
+//    a.IsNonDecreasing([]int{1, 1, 2})
+//    a.IsNonDecreasing([]float{1, 2})
+//    a.IsNonDecreasing([]string{"a", "b"})
+func (a *Assertions) IsNonDecreasing(object interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsNonDecreasing(a.t, object, msgAndArgs...)
+}
+
+// IsNonDecreasingf asserts that the collection is not decreasing
+//
+//    a.IsNonDecreasingf([]int{1, 1, 2}, "error message %s", "formatted")
+//    a.IsNonDecreasingf([]float{1, 2}, "error message %s", "formatted")
+//    a.IsNonDecreasingf([]string{"a", "b"}, "error message %s", "formatted")
+func (a *Assertions) IsNonDecreasingf(object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsNonDecreasingf(a.t, object, msg, args...)
+}
+
+// IsNonIncreasing asserts that the collection is not increasing
+//
+//    a.IsNonIncreasing([]int{2, 1, 1})
+//    a.IsNonIncreasing([]float{2, 1})
+//    a.IsNonIncreasing([]string{"b", "a"})
+func (a *Assertions) IsNonIncreasing(object interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsNonIncreasing(a.t, object, msgAndArgs...)
+}
+
+// IsNonIncreasingf asserts that the collection is not increasing
+//
+//    a.IsNonIncreasingf([]int{2, 1, 1}, "error message %s", "formatted")
+//    a.IsNonIncreasingf([]float{2, 1}, "error message %s", "formatted")
+//    a.IsNonIncreasingf([]string{"b", "a"}, "error message %s", "formatted")
+func (a *Assertions) IsNonIncreasingf(object interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return IsNonIncreasingf(a.t, object, msg, args...)
+}
+
 // IsType asserts that the specified objects are of the same type.
 func (a *Assertions) IsType(expectedType interface{}, object interface{}, msgAndArgs ...interface{}) bool {
 	if h, ok := a.t.(tHelper); ok {
@@ -739,6 +871,28 @@ func (a *Assertions) Lessf(e1 interface{}, e2 interface{}, msg string, args ...i
 	return Lessf(a.t, e1, e2, msg, args...)
 }
 
+// Negative asserts that the specified element is negative
+//
+//    a.Negative(-1)
+//    a.Negative(-1.23)
+func (a *Assertions) Negative(e interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return Negative(a.t, e, msgAndArgs...)
+}
+
+// Negativef asserts that the specified element is negative
+//
+//    a.Negativef(-1, "error message %s", "formatted")
+//    a.Negativef(-1.23, "error message %s", "formatted")
+func (a *Assertions) Negativef(e interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return Negativef(a.t, e, msg, args...)
+}
+
 // Never asserts that the given condition doesn't satisfy in waitFor time,
 // periodically checking the target function each tick.
 //
@@ -941,6 +1095,24 @@ func (a *Assertions) NotEqualf(expected interface{}, actual interface{}, msg str
 	return NotEqualf(a.t, expected, actual, msg, args...)
 }
 
+// NotErrorIs asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) NotErrorIs(err error, target error, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return NotErrorIs(a.t, err, target, msgAndArgs...)
+}
+
+// NotErrorIsf asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) NotErrorIsf(err error, target error, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return NotErrorIsf(a.t, err, target, msg, args...)
+}
+
 // NotNil asserts that the specified object is not nil.
 //
 //    a.NotNil(err)
@@ -1133,6 +1305,28 @@ func (a *Assertions) Panicsf(f PanicTestFunc, msg string, args ...interface{}) b
 	return Panicsf(a.t, f, msg, args...)
 }
 
+// Positive asserts that the specified element is positive
+//
+//    a.Positive(1)
+//    a.Positive(1.23)
+func (a *Assertions) Positive(e interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return Positive(a.t, e, msgAndArgs...)
+}
+
+// Positivef asserts that the specified element is positive
+//
+//    a.Positivef(1, "error message %s", "formatted")
+//    a.Positivef(1.23, "error message %s", "formatted")
+func (a *Assertions) Positivef(e interface{}, msg string, args ...interface{}) bool {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	return Positivef(a.t, e, msg, args...)
+}
+
 // Regexp asserts that a specified regexp matches a string.
 //
 //  a.Regexp(regexp.MustCompile("start"), "it's starting")
diff --git a/vendor/github.com/stretchr/testify/assert/assertion_order.go b/vendor/github.com/stretchr/testify/assert/assertion_order.go
new file mode 100644
index 0000000..1c3b471
--- /dev/null
+++ b/vendor/github.com/stretchr/testify/assert/assertion_order.go
@@ -0,0 +1,81 @@
+package assert
+
+import (
+	"fmt"
+	"reflect"
+)
+
+// isOrdered checks that collection contains orderable elements.
+func isOrdered(t TestingT, object interface{}, allowedComparesResults []CompareType, failMessage string, msgAndArgs ...interface{}) bool {
+	objKind := reflect.TypeOf(object).Kind()
+	if objKind != reflect.Slice && objKind != reflect.Array {
+		return false
+	}
+
+	objValue := reflect.ValueOf(object)
+	objLen := objValue.Len()
+
+	if objLen <= 1 {
+		return true
+	}
+
+	value := objValue.Index(0)
+	valueInterface := value.Interface()
+	firstValueKind := value.Kind()
+
+	for i := 1; i < objLen; i++ {
+		prevValue := value
+		prevValueInterface := valueInterface
+
+		value = objValue.Index(i)
+		valueInterface = value.Interface()
+
+		compareResult, isComparable := compare(prevValueInterface, valueInterface, firstValueKind)
+
+		if !isComparable {
+			return Fail(t, fmt.Sprintf("Can not compare type \"%s\" and \"%s\"", reflect.TypeOf(value), reflect.TypeOf(prevValue)), msgAndArgs...)
+		}
+
+		if !containsValue(allowedComparesResults, compareResult) {
+			return Fail(t, fmt.Sprintf(failMessage, prevValue, value), msgAndArgs...)
+		}
+	}
+
+	return true
+}
+
+// IsIncreasing asserts that the collection is increasing
+//
+//    assert.IsIncreasing(t, []int{1, 2, 3})
+//    assert.IsIncreasing(t, []float{1, 2})
+//    assert.IsIncreasing(t, []string{"a", "b"})
+func IsIncreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool {
+	return isOrdered(t, object, []CompareType{compareLess}, "\"%v\" is not less than \"%v\"", msgAndArgs)
+}
+
+// IsNonIncreasing asserts that the collection is not increasing
+//
+//    assert.IsNonIncreasing(t, []int{2, 1, 1})
+//    assert.IsNonIncreasing(t, []float{2, 1})
+//    assert.IsNonIncreasing(t, []string{"b", "a"})
+func IsNonIncreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool {
+	return isOrdered(t, object, []CompareType{compareEqual, compareGreater}, "\"%v\" is not greater than or equal to \"%v\"", msgAndArgs)
+}
+
+// IsDecreasing asserts that the collection is decreasing
+//
+//    assert.IsDecreasing(t, []int{2, 1, 0})
+//    assert.IsDecreasing(t, []float{2, 1})
+//    assert.IsDecreasing(t, []string{"b", "a"})
+func IsDecreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool {
+	return isOrdered(t, object, []CompareType{compareGreater}, "\"%v\" is not greater than \"%v\"", msgAndArgs)
+}
+
+// IsNonDecreasing asserts that the collection is not decreasing
+//
+//    assert.IsNonDecreasing(t, []int{1, 1, 2})
+//    assert.IsNonDecreasing(t, []float{1, 2})
+//    assert.IsNonDecreasing(t, []string{"a", "b"})
+func IsNonDecreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) bool {
+	return isOrdered(t, object, []CompareType{compareLess, compareEqual}, "\"%v\" is not less than or equal to \"%v\"", msgAndArgs)
+}
diff --git a/vendor/github.com/stretchr/testify/assert/assertions.go b/vendor/github.com/stretchr/testify/assert/assertions.go
index 914a10d..bcac440 100644
--- a/vendor/github.com/stretchr/testify/assert/assertions.go
+++ b/vendor/github.com/stretchr/testify/assert/assertions.go
@@ -172,8 +172,8 @@ func isTest(name, prefix string) bool {
 	if len(name) == len(prefix) { // "Test" is ok
 		return true
 	}
-	rune, _ := utf8.DecodeRuneInString(name[len(prefix):])
-	return !unicode.IsLower(rune)
+	r, _ := utf8.DecodeRuneInString(name[len(prefix):])
+	return !unicode.IsLower(r)
 }
 
 func messageFromMsgAndArgs(msgAndArgs ...interface{}) string {
@@ -1622,6 +1622,7 @@ var spewConfig = spew.ConfigState{
 	DisableCapacities:       true,
 	SortKeys:                true,
 	DisableMethods:          true,
+	MaxDepth:                10,
 }
 
 type tHelper interface {
@@ -1693,3 +1694,81 @@ func Never(t TestingT, condition func() bool, waitFor time.Duration, tick time.D
 		}
 	}
 }
+
+// ErrorIs asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func ErrorIs(t TestingT, err, target error, msgAndArgs ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if errors.Is(err, target) {
+		return true
+	}
+
+	var expectedText string
+	if target != nil {
+		expectedText = target.Error()
+	}
+
+	chain := buildErrorChainString(err)
+
+	return Fail(t, fmt.Sprintf("Target error should be in err chain:\n"+
+		"expected: %q\n"+
+		"in chain: %s", expectedText, chain,
+	), msgAndArgs...)
+}
+
+// NotErrorIs asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func NotErrorIs(t TestingT, err, target error, msgAndArgs ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if !errors.Is(err, target) {
+		return true
+	}
+
+	var expectedText string
+	if target != nil {
+		expectedText = target.Error()
+	}
+
+	chain := buildErrorChainString(err)
+
+	return Fail(t, fmt.Sprintf("Target error should not be in err chain:\n"+
+		"found: %q\n"+
+		"in chain: %s", expectedText, chain,
+	), msgAndArgs...)
+}
+
+// ErrorAs asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func ErrorAs(t TestingT, err error, target interface{}, msgAndArgs ...interface{}) bool {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if errors.As(err, target) {
+		return true
+	}
+
+	chain := buildErrorChainString(err)
+
+	return Fail(t, fmt.Sprintf("Should be in error chain:\n"+
+		"expected: %q\n"+
+		"in chain: %s", target, chain,
+	), msgAndArgs...)
+}
+
+func buildErrorChainString(err error) string {
+	if err == nil {
+		return ""
+	}
+
+	e := errors.Unwrap(err)
+	chain := fmt.Sprintf("%q", err.Error())
+	for e != nil {
+		chain += fmt.Sprintf("\n\t%q", e.Error())
+		e = errors.Unwrap(e)
+	}
+	return chain
+}
diff --git a/vendor/github.com/stretchr/testify/require/require.go b/vendor/github.com/stretchr/testify/require/require.go
index ec4624b..51820df 100644
--- a/vendor/github.com/stretchr/testify/require/require.go
+++ b/vendor/github.com/stretchr/testify/require/require.go
@@ -256,6 +256,54 @@ func Error(t TestingT, err error, msgAndArgs ...interface{}) {
 	t.FailNow()
 }
 
+// ErrorAs asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func ErrorAs(t TestingT, err error, target interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.ErrorAs(t, err, target, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// ErrorAsf asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func ErrorAsf(t TestingT, err error, target interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.ErrorAsf(t, err, target, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
+// ErrorIs asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func ErrorIs(t TestingT, err error, target error, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.ErrorIs(t, err, target, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// ErrorIsf asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func ErrorIsf(t TestingT, err error, target error, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.ErrorIsf(t, err, target, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
 // Errorf asserts that a function returned an error (i.e. not `nil`).
 //
 //   actualObj, err := SomeFunction()
@@ -806,6 +854,126 @@ func InEpsilonf(t TestingT, expected interface{}, actual interface{}, epsilon fl
 	t.FailNow()
 }
 
+// IsDecreasing asserts that the collection is decreasing
+//
+//    assert.IsDecreasing(t, []int{2, 1, 0})
+//    assert.IsDecreasing(t, []float{2, 1})
+//    assert.IsDecreasing(t, []string{"b", "a"})
+func IsDecreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsDecreasing(t, object, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsDecreasingf asserts that the collection is decreasing
+//
+//    assert.IsDecreasingf(t, []int{2, 1, 0}, "error message %s", "formatted")
+//    assert.IsDecreasingf(t, []float{2, 1}, "error message %s", "formatted")
+//    assert.IsDecreasingf(t, []string{"b", "a"}, "error message %s", "formatted")
+func IsDecreasingf(t TestingT, object interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsDecreasingf(t, object, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsIncreasing asserts that the collection is increasing
+//
+//    assert.IsIncreasing(t, []int{1, 2, 3})
+//    assert.IsIncreasing(t, []float{1, 2})
+//    assert.IsIncreasing(t, []string{"a", "b"})
+func IsIncreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsIncreasing(t, object, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsIncreasingf asserts that the collection is increasing
+//
+//    assert.IsIncreasingf(t, []int{1, 2, 3}, "error message %s", "formatted")
+//    assert.IsIncreasingf(t, []float{1, 2}, "error message %s", "formatted")
+//    assert.IsIncreasingf(t, []string{"a", "b"}, "error message %s", "formatted")
+func IsIncreasingf(t TestingT, object interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsIncreasingf(t, object, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsNonDecreasing asserts that the collection is not decreasing
+//
+//    assert.IsNonDecreasing(t, []int{1, 1, 2})
+//    assert.IsNonDecreasing(t, []float{1, 2})
+//    assert.IsNonDecreasing(t, []string{"a", "b"})
+func IsNonDecreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsNonDecreasing(t, object, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsNonDecreasingf asserts that the collection is not decreasing
+//
+//    assert.IsNonDecreasingf(t, []int{1, 1, 2}, "error message %s", "formatted")
+//    assert.IsNonDecreasingf(t, []float{1, 2}, "error message %s", "formatted")
+//    assert.IsNonDecreasingf(t, []string{"a", "b"}, "error message %s", "formatted")
+func IsNonDecreasingf(t TestingT, object interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsNonDecreasingf(t, object, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsNonIncreasing asserts that the collection is not increasing
+//
+//    assert.IsNonIncreasing(t, []int{2, 1, 1})
+//    assert.IsNonIncreasing(t, []float{2, 1})
+//    assert.IsNonIncreasing(t, []string{"b", "a"})
+func IsNonIncreasing(t TestingT, object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsNonIncreasing(t, object, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// IsNonIncreasingf asserts that the collection is not increasing
+//
+//    assert.IsNonIncreasingf(t, []int{2, 1, 1}, "error message %s", "formatted")
+//    assert.IsNonIncreasingf(t, []float{2, 1}, "error message %s", "formatted")
+//    assert.IsNonIncreasingf(t, []string{"b", "a"}, "error message %s", "formatted")
+func IsNonIncreasingf(t TestingT, object interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.IsNonIncreasingf(t, object, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
 // IsType asserts that the specified objects are of the same type.
 func IsType(t TestingT, expectedType interface{}, object interface{}, msgAndArgs ...interface{}) {
 	if h, ok := t.(tHelper); ok {
@@ -944,6 +1112,34 @@ func Lessf(t TestingT, e1 interface{}, e2 interface{}, msg string, args ...inter
 	t.FailNow()
 }
 
+// Negative asserts that the specified element is negative
+//
+//    assert.Negative(t, -1)
+//    assert.Negative(t, -1.23)
+func Negative(t TestingT, e interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.Negative(t, e, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// Negativef asserts that the specified element is negative
+//
+//    assert.Negativef(t, -1, "error message %s", "formatted")
+//    assert.Negativef(t, -1.23, "error message %s", "formatted")
+func Negativef(t TestingT, e interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.Negativef(t, e, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
 // Never asserts that the given condition doesn't satisfy in waitFor time,
 // periodically checking the target function each tick.
 //
@@ -1200,6 +1396,30 @@ func NotEqualf(t TestingT, expected interface{}, actual interface{}, msg string,
 	t.FailNow()
 }
 
+// NotErrorIs asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func NotErrorIs(t TestingT, err error, target error, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.NotErrorIs(t, err, target, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// NotErrorIsf asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func NotErrorIsf(t TestingT, err error, target error, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.NotErrorIsf(t, err, target, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
 // NotNil asserts that the specified object is not nil.
 //
 //    assert.NotNil(t, err)
@@ -1446,6 +1666,34 @@ func Panicsf(t TestingT, f assert.PanicTestFunc, msg string, args ...interface{}
 	t.FailNow()
 }
 
+// Positive asserts that the specified element is positive
+//
+//    assert.Positive(t, 1)
+//    assert.Positive(t, 1.23)
+func Positive(t TestingT, e interface{}, msgAndArgs ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.Positive(t, e, msgAndArgs...) {
+		return
+	}
+	t.FailNow()
+}
+
+// Positivef asserts that the specified element is positive
+//
+//    assert.Positivef(t, 1, "error message %s", "formatted")
+//    assert.Positivef(t, 1.23, "error message %s", "formatted")
+func Positivef(t TestingT, e interface{}, msg string, args ...interface{}) {
+	if h, ok := t.(tHelper); ok {
+		h.Helper()
+	}
+	if assert.Positivef(t, e, msg, args...) {
+		return
+	}
+	t.FailNow()
+}
+
 // Regexp asserts that a specified regexp matches a string.
 //
 //  assert.Regexp(t, regexp.MustCompile("start"), "it's starting")
diff --git a/vendor/github.com/stretchr/testify/require/require_forward.go b/vendor/github.com/stretchr/testify/require/require_forward.go
index 103d7dc..ed54a9d 100644
--- a/vendor/github.com/stretchr/testify/require/require_forward.go
+++ b/vendor/github.com/stretchr/testify/require/require_forward.go
@@ -205,6 +205,42 @@ func (a *Assertions) Error(err error, msgAndArgs ...interface{}) {
 	Error(a.t, err, msgAndArgs...)
 }
 
+// ErrorAs asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func (a *Assertions) ErrorAs(err error, target interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	ErrorAs(a.t, err, target, msgAndArgs...)
+}
+
+// ErrorAsf asserts that at least one of the errors in err's chain matches target, and if so, sets target to that error value.
+// This is a wrapper for errors.As.
+func (a *Assertions) ErrorAsf(err error, target interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	ErrorAsf(a.t, err, target, msg, args...)
+}
+
+// ErrorIs asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) ErrorIs(err error, target error, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	ErrorIs(a.t, err, target, msgAndArgs...)
+}
+
+// ErrorIsf asserts that at least one of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) ErrorIsf(err error, target error, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	ErrorIsf(a.t, err, target, msg, args...)
+}
+
 // Errorf asserts that a function returned an error (i.e. not `nil`).
 //
 //   actualObj, err := SomeFunction()
@@ -632,6 +668,102 @@ func (a *Assertions) InEpsilonf(expected interface{}, actual interface{}, epsilo
 	InEpsilonf(a.t, expected, actual, epsilon, msg, args...)
 }
 
+// IsDecreasing asserts that the collection is decreasing
+//
+//    a.IsDecreasing([]int{2, 1, 0})
+//    a.IsDecreasing([]float{2, 1})
+//    a.IsDecreasing([]string{"b", "a"})
+func (a *Assertions) IsDecreasing(object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsDecreasing(a.t, object, msgAndArgs...)
+}
+
+// IsDecreasingf asserts that the collection is decreasing
+//
+//    a.IsDecreasingf([]int{2, 1, 0}, "error message %s", "formatted")
+//    a.IsDecreasingf([]float{2, 1}, "error message %s", "formatted")
+//    a.IsDecreasingf([]string{"b", "a"}, "error message %s", "formatted")
+func (a *Assertions) IsDecreasingf(object interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsDecreasingf(a.t, object, msg, args...)
+}
+
+// IsIncreasing asserts that the collection is increasing
+//
+//    a.IsIncreasing([]int{1, 2, 3})
+//    a.IsIncreasing([]float{1, 2})
+//    a.IsIncreasing([]string{"a", "b"})
+func (a *Assertions) IsIncreasing(object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsIncreasing(a.t, object, msgAndArgs...)
+}
+
+// IsIncreasingf asserts that the collection is increasing
+//
+//    a.IsIncreasingf([]int{1, 2, 3}, "error message %s", "formatted")
+//    a.IsIncreasingf([]float{1, 2}, "error message %s", "formatted")
+//    a.IsIncreasingf([]string{"a", "b"}, "error message %s", "formatted")
+func (a *Assertions) IsIncreasingf(object interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsIncreasingf(a.t, object, msg, args...)
+}
+
+// IsNonDecreasing asserts that the collection is not decreasing
+//
+//    a.IsNonDecreasing([]int{1, 1, 2})
+//    a.IsNonDecreasing([]float{1, 2})
+//    a.IsNonDecreasing([]string{"a", "b"})
+func (a *Assertions) IsNonDecreasing(object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsNonDecreasing(a.t, object, msgAndArgs...)
+}
+
+// IsNonDecreasingf asserts that the collection is not decreasing
+//
+//    a.IsNonDecreasingf([]int{1, 1, 2}, "error message %s", "formatted")
+//    a.IsNonDecreasingf([]float{1, 2}, "error message %s", "formatted")
+//    a.IsNonDecreasingf([]string{"a", "b"}, "error message %s", "formatted")
+func (a *Assertions) IsNonDecreasingf(object interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsNonDecreasingf(a.t, object, msg, args...)
+}
+
+// IsNonIncreasing asserts that the collection is not increasing
+//
+//    a.IsNonIncreasing([]int{2, 1, 1})
+//    a.IsNonIncreasing([]float{2, 1})
+//    a.IsNonIncreasing([]string{"b", "a"})
+func (a *Assertions) IsNonIncreasing(object interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsNonIncreasing(a.t, object, msgAndArgs...)
+}
+
+// IsNonIncreasingf asserts that the collection is not increasing
+//
+//    a.IsNonIncreasingf([]int{2, 1, 1}, "error message %s", "formatted")
+//    a.IsNonIncreasingf([]float{2, 1}, "error message %s", "formatted")
+//    a.IsNonIncreasingf([]string{"b", "a"}, "error message %s", "formatted")
+func (a *Assertions) IsNonIncreasingf(object interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	IsNonIncreasingf(a.t, object, msg, args...)
+}
+
 // IsType asserts that the specified objects are of the same type.
 func (a *Assertions) IsType(expectedType interface{}, object interface{}, msgAndArgs ...interface{}) {
 	if h, ok := a.t.(tHelper); ok {
@@ -740,6 +872,28 @@ func (a *Assertions) Lessf(e1 interface{}, e2 interface{}, msg string, args ...i
 	Lessf(a.t, e1, e2, msg, args...)
 }
 
+// Negative asserts that the specified element is negative
+//
+//    a.Negative(-1)
+//    a.Negative(-1.23)
+func (a *Assertions) Negative(e interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	Negative(a.t, e, msgAndArgs...)
+}
+
+// Negativef asserts that the specified element is negative
+//
+//    a.Negativef(-1, "error message %s", "formatted")
+//    a.Negativef(-1.23, "error message %s", "formatted")
+func (a *Assertions) Negativef(e interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	Negativef(a.t, e, msg, args...)
+}
+
 // Never asserts that the given condition doesn't satisfy in waitFor time,
 // periodically checking the target function each tick.
 //
@@ -942,6 +1096,24 @@ func (a *Assertions) NotEqualf(expected interface{}, actual interface{}, msg str
 	NotEqualf(a.t, expected, actual, msg, args...)
 }
 
+// NotErrorIs asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) NotErrorIs(err error, target error, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	NotErrorIs(a.t, err, target, msgAndArgs...)
+}
+
+// NotErrorIsf asserts that at none of the errors in err's chain matches target.
+// This is a wrapper for errors.Is.
+func (a *Assertions) NotErrorIsf(err error, target error, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	NotErrorIsf(a.t, err, target, msg, args...)
+}
+
 // NotNil asserts that the specified object is not nil.
 //
 //    a.NotNil(err)
@@ -1134,6 +1306,28 @@ func (a *Assertions) Panicsf(f assert.PanicTestFunc, msg string, args ...interfa
 	Panicsf(a.t, f, msg, args...)
 }
 
+// Positive asserts that the specified element is positive
+//
+//    a.Positive(1)
+//    a.Positive(1.23)
+func (a *Assertions) Positive(e interface{}, msgAndArgs ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	Positive(a.t, e, msgAndArgs...)
+}
+
+// Positivef asserts that the specified element is positive
+//
+//    a.Positivef(1, "error message %s", "formatted")
+//    a.Positivef(1.23, "error message %s", "formatted")
+func (a *Assertions) Positivef(e interface{}, msg string, args ...interface{}) {
+	if h, ok := a.t.(tHelper); ok {
+		h.Helper()
+	}
+	Positivef(a.t, e, msg, args...)
+}
+
 // Regexp asserts that a specified regexp matches a string.
 //
 //  a.Regexp(regexp.MustCompile("start"), "it's starting")
diff --git a/vendor/github.com/ulikunitz/xz/LICENSE b/vendor/github.com/ulikunitz/xz/LICENSE
index d321499..009b848 100644
--- a/vendor/github.com/ulikunitz/xz/LICENSE
+++ b/vendor/github.com/ulikunitz/xz/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2014-2020  Ulrich Kunitz
+Copyright (c) 2014-2021  Ulrich Kunitz
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/vendor/github.com/ulikunitz/xz/SECURITY.md b/vendor/github.com/ulikunitz/xz/SECURITY.md
new file mode 100644
index 0000000..5f7ec01
--- /dev/null
+++ b/vendor/github.com/ulikunitz/xz/SECURITY.md
@@ -0,0 +1,10 @@
+# Security Policy
+
+## Supported Versions
+
+Currently the last minor version v0.5.x is supported.
+
+## Reporting a Vulnerability
+
+Report a vulnerability by creating a Github issue at
+<https://github.com/ulikunitz/xz/issues>. Expect a response in a week.
diff --git a/vendor/github.com/ulikunitz/xz/TODO.md b/vendor/github.com/ulikunitz/xz/TODO.md
index a4224ce..594e0c7 100644
--- a/vendor/github.com/ulikunitz/xz/TODO.md
+++ b/vendor/github.com/ulikunitz/xz/TODO.md
@@ -8,19 +8,17 @@
 
 1. Review encoder and check for lzma improvements under xz.
 2. Fix binary tree matcher.
-3. Compare compression ratio with xz tool using comparable parameters
-   and optimize parameters
-4. Do some optimizations
-    - rename operation action and make it a simple type of size 8
-    - make maxMatches, wordSize parameters
-    - stop searching after a certain length is found (parameter sweetLen)
+3. Compare compression ratio with xz tool using comparable parameters and optimize parameters
+4. rename operation action and make it a simple type of size 8
+5. make maxMatches, wordSize parameters
+6. stop searching after a certain length is found (parameter sweetLen)
 
 ## Release v0.7
 
 1. Optimize code
 2. Do statistical analysis to get linear presets.
 3. Test sync.Pool compatability for xz and lzma Writer and Reader
-3. Fuzz optimized code.
+4. Fuzz optimized code.
 
 ## Release v0.8
 
@@ -44,52 +42,85 @@
 
 ## Package lzma
 
-### Release v0.6
-
-- Rewrite Encoder into a simple greedy one-op-at-a-time encoder
-  including
-    + simple scan at the dictionary head for the same byte
-    + use the killer byte (requiring matches to get longer, the first
-      test should be the byte that would make the match longer)
+### v0.6
 
+* Rewrite Encoder into a simple greedy one-op-at-a-time encoder including
+  * simple scan at the dictionary head for the same byte
+  * use the killer byte (requiring matches to get longer, the first test should be the byte that would make the match longer)
 
 ## Optimizations
 
-- There may be a lot of false sharing in lzma.State; check whether this
-  can be improved by reorganizing the internal structure of it.
-- Check whether batching encoding and decoding improves speed.
+* There may be a lot of false sharing in lzma. State; check whether this  can be improved by reorganizing the internal structure of it.
+
+* Check whether batching encoding and decoding improves speed.
 
 ### DAG optimizations
 
-- Use full buffer to create minimal bit-length above range encoder.
-- Might be too slow (see v0.4)
+* Use full buffer to create minimal bit-length above range encoder.
+* Might be too slow (see v0.4)
 
 ### Different match finders
 
-- hashes with 2, 3 characters additional to 4 characters
-- binary trees with 2-7 characters (uint64 as key, use uint32 as
+* hashes with 2, 3 characters additional to 4 characters
+* binary trees with 2-7 characters (uint64 as key, use uint32 as
+
   pointers into a an array)
-- rb-trees with 2-7 characters (uint64 as key, use uint32 as pointers
+
+* rb-trees with 2-7 characters (uint64 as key, use uint32 as pointers
+
   into an array with bit-steeling for the colors)
 
 ## Release Procedure
 
-- execute goch -l for all packages; probably with lower param like 0.5.
-- check orthography with gospell
-- Write release notes in doc/relnotes.
-- Update README.md
-- xb copyright . in xz directory to ensure all new files have Copyright
-  header
-- VERSION=<version> go generate github.com/ulikunitz/xz/... to update
-  version files
-- Execute test for Linux/amd64, Linux/x86 and Windows/amd64.
-- Update TODO.md - write short log entry
-- git checkout master && git merge dev
-- git tag -a <version>
-- git push
+* execute goch -l for all packages; probably with lower param like 0.5.
+* check orthography with gospell
+* Write release notes in doc/relnotes.
+* Update README.md
+* xb copyright . in xz directory to ensure all new files have Copyright header
+* `VERSION=<version> go generate github.com/ulikunitz/xz/...` to update version files
+* Execute test for Linux/amd64, Linux/x86 and Windows/amd64.
+* Update TODO.md - write short log entry
+* `git checkout master && git merge dev`
+* `git tag -a <version>`
+* `git push`
 
 ## Log
 
+### 2021-02-02
+
+Mituo Heijo has fuzzed xz and found a bug in the function readIndexBody. The
+function allocated a slice of records immediately after reading the value
+without further checks. Since the number has been too large the make function
+did panic. The fix is to check the number against the expected number of records
+before allocating the records.
+
+### 2020-12-17
+
+Release v0.5.9 fixes warnings, a typo and adds SECURITY.md.
+
+One fix is interesting.
+
+```go
+const (
+  a byte = 0x1
+  b      = 0x2
+)
+```
+
+The constants a and b don't have the same type. Correct is
+
+```go
+const (
+  a byte = 0x1
+  b byte = 0x2
+)
+```
+
+### 2020-08-19
+
+Release v0.5.8 fixes issue
+[issue #35](https://github.com/ulikunitz/xz/issues/35).
+
 ### 2020-02-24
 
 Release v0.5.7 supports the check-ID None and fixes
@@ -203,8 +234,8 @@ MININT.
 
 ### 2015-06-04
 
-It has been a productive day. I improved the interface of lzma.Reader
-and lzma.Writer and fixed the error handling.
+It has been a productive day. I improved the interface of lzma. Reader
+and lzma. Writer and fixed the error handling.
 
 ### 2015-06-01
 
@@ -255,7 +286,7 @@ needed anymore.
 
 However I will implement a ReaderState and WriterState type to use
 static typing to ensure the right State object is combined with the
-right lzbase.Reader and lzbase.Writer.
+right lzbase. Reader and lzbase. Writer.
 
 As a start I have implemented ReaderState and WriterState to ensure
 that the state for reading is only used by readers and WriterState only
@@ -277,11 +308,11 @@ old lzma package has been completely removed.
 
 ### 2015-04-05
 
-Implemented lzma.Reader and tested it.
+Implemented lzma. Reader and tested it.
 
 ### 2015-04-04
 
-Implemented baseReader by adapting code form lzma.Reader.
+Implemented baseReader by adapting code form lzma. Reader.
 
 ### 2015-04-03
 
@@ -297,7 +328,7 @@ However in Francesco Campoy's presentation "Go for Javaneros
 (Javaïstes?)" is the the idea that using an embedded field E, all the
 methods of E will be defined on T. If E is an interface T satisfies E.
 
-https://talks.golang.org/2014/go4java.slide#51
+<https://talks.golang.org/2014/go4java.slide#51>
 
 I have never used this, but it seems to be a cool idea.
 
@@ -322,11 +353,11 @@ and the opCodec.
 
 1. Implemented simple lzmago tool
 2. Tested tool against large 4.4G file
-    - compression worked correctly; tested decompression with lzma
-    - decompression hits a full buffer condition
+   * compression worked correctly; tested decompression with lzma
+   * decompression hits a full buffer condition
 3. Fixed a bug in the compressor and wrote a test for it
 4. Executed full cycle for 4.4 GB file; performance can be improved ;-)
 
 ### 2015-01-11
 
-- Release v0.2 because of the working LZMA encoder and decoder
+* Release v0.2 because of the working LZMA encoder and decoder
diff --git a/vendor/github.com/ulikunitz/xz/bits.go b/vendor/github.com/ulikunitz/xz/bits.go
index 364213d..e48450c 100644
--- a/vendor/github.com/ulikunitz/xz/bits.go
+++ b/vendor/github.com/ulikunitz/xz/bits.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -54,6 +54,8 @@ var errOverflowU64 = errors.New("xz: uvarint overflows 64-bit unsigned integer")
 
 // readUvarint reads a uvarint from the given byte reader.
 func readUvarint(r io.ByteReader) (x uint64, n int, err error) {
+	const maxUvarintLen = 10
+
 	var s uint
 	i := 0
 	for {
@@ -62,8 +64,11 @@ func readUvarint(r io.ByteReader) (x uint64, n int, err error) {
 			return x, i, err
 		}
 		i++
+		if i > maxUvarintLen {
+			return x, i, errOverflowU64
+		}
 		if b < 0x80 {
-			if i > 10 || i == 10 && b > 1 {
+			if i == maxUvarintLen && b > 1 {
 				return x, i, errOverflowU64
 			}
 			return x | uint64(b)<<s, i, nil
diff --git a/vendor/github.com/ulikunitz/xz/crc.go b/vendor/github.com/ulikunitz/xz/crc.go
index 638774a..a5c57fb 100644
--- a/vendor/github.com/ulikunitz/xz/crc.go
+++ b/vendor/github.com/ulikunitz/xz/crc.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/format.go b/vendor/github.com/ulikunitz/xz/format.go
index edfec9a..c98c12d 100644
--- a/vendor/github.com/ulikunitz/xz/format.go
+++ b/vendor/github.com/ulikunitz/xz/format.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -47,9 +47,9 @@ const HeaderLen = 12
 // Constants for the checksum methods supported by xz.
 const (
 	None   byte = 0x0
-	CRC32       = 0x1
-	CRC64       = 0x4
-	SHA256      = 0xa
+	CRC32  byte = 0x1
+	CRC64  byte = 0x4
+	SHA256 byte = 0xa
 )
 
 // errInvalidFlags indicates that flags are invalid.
@@ -569,22 +569,6 @@ func readFilters(r io.Reader, count int) (filters []filter, err error) {
 	return []filter{f}, err
 }
 
-// writeFilters writes the filters.
-func writeFilters(w io.Writer, filters []filter) (n int, err error) {
-	for _, f := range filters {
-		p, err := f.MarshalBinary()
-		if err != nil {
-			return n, err
-		}
-		k, err := w.Write(p)
-		n += k
-		if err != nil {
-			return n, err
-		}
-	}
-	return n, nil
-}
-
 /*** Index ***/
 
 // record describes a block in the xz file index.
@@ -678,7 +662,7 @@ func writeIndex(w io.Writer, index []record) (n int64, err error) {
 
 // readIndexBody reads the index from the reader. It assumes that the
 // index indicator has already been read.
-func readIndexBody(r io.Reader) (records []record, n int64, err error) {
+func readIndexBody(r io.Reader, expectedRecordLen int) (records []record, n int64, err error) {
 	crc := crc32.NewIEEE()
 	// index indicator
 	crc.Write([]byte{0})
@@ -695,6 +679,11 @@ func readIndexBody(r io.Reader) (records []record, n int64, err error) {
 	if recLen < 0 || uint64(recLen) != u {
 		return nil, n, errors.New("xz: record number overflow")
 	}
+	if recLen != expectedRecordLen {
+		return nil, n, fmt.Errorf(
+			"xz: index length is %d; want %d",
+			recLen, expectedRecordLen)
+	}
 
 	// list of records
 	records = make([]record, recLen)
diff --git a/vendor/github.com/ulikunitz/xz/internal/hash/cyclic_poly.go b/vendor/github.com/ulikunitz/xz/internal/hash/cyclic_poly.go
index f2861ba..f723cf2 100644
--- a/vendor/github.com/ulikunitz/xz/internal/hash/cyclic_poly.go
+++ b/vendor/github.com/ulikunitz/xz/internal/hash/cyclic_poly.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/internal/hash/doc.go b/vendor/github.com/ulikunitz/xz/internal/hash/doc.go
index e28d23b..cc60a6b 100644
--- a/vendor/github.com/ulikunitz/xz/internal/hash/doc.go
+++ b/vendor/github.com/ulikunitz/xz/internal/hash/doc.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/internal/hash/rabin_karp.go b/vendor/github.com/ulikunitz/xz/internal/hash/rabin_karp.go
index b8e66d9..c643291 100644
--- a/vendor/github.com/ulikunitz/xz/internal/hash/rabin_karp.go
+++ b/vendor/github.com/ulikunitz/xz/internal/hash/rabin_karp.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/internal/hash/roller.go b/vendor/github.com/ulikunitz/xz/internal/hash/roller.go
index 34c81b3..f1de88b 100644
--- a/vendor/github.com/ulikunitz/xz/internal/hash/roller.go
+++ b/vendor/github.com/ulikunitz/xz/internal/hash/roller.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/internal/xlog/xlog.go b/vendor/github.com/ulikunitz/xz/internal/xlog/xlog.go
index 678b5a0..6c20c77 100644
--- a/vendor/github.com/ulikunitz/xz/internal/xlog/xlog.go
+++ b/vendor/github.com/ulikunitz/xz/internal/xlog/xlog.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/bintree.go b/vendor/github.com/ulikunitz/xz/lzma/bintree.go
index 58d6a92..2a7bd19 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/bintree.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/bintree.go
@@ -1,14 +1,11 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package lzma
 
 import (
-	"bufio"
 	"errors"
-	"fmt"
-	"io"
 	"unicode"
 )
 
@@ -349,6 +346,7 @@ func dumpX(x uint32) string {
 	return string(a)
 }
 
+/*
 // dumpNode writes a representation of the node v into the io.Writer.
 func (t *binTree) dumpNode(w io.Writer, v uint32, indent int) {
 	if v == null {
@@ -377,6 +375,7 @@ func (t *binTree) dump(w io.Writer) error {
 	t.dumpNode(bw, t.root, 0)
 	return bw.Flush()
 }
+*/
 
 func (t *binTree) distance(v uint32) int {
 	dist := int(t.front) - int(v)
diff --git a/vendor/github.com/ulikunitz/xz/lzma/bitops.go b/vendor/github.com/ulikunitz/xz/lzma/bitops.go
index 2784ec6..d2c07e8 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/bitops.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/bitops.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -18,6 +18,7 @@ var ntz32Table = [32]int8{
 	30, 17, 8, 14, 29, 13, 28, 27,
 }
 
+/*
 // ntz32 computes the number of trailing zeros for an unsigned 32-bit integer.
 func ntz32(x uint32) int {
 	if x == 0 {
@@ -26,6 +27,7 @@ func ntz32(x uint32) int {
 	x = (x & -x) * ntz32Const
 	return int(ntz32Table[x>>27])
 }
+*/
 
 // nlz32 computes the number of leading zeros for an unsigned 32-bit integer.
 func nlz32(x uint32) int {
diff --git a/vendor/github.com/ulikunitz/xz/lzma/breader.go b/vendor/github.com/ulikunitz/xz/lzma/breader.go
index 4ad09a1..939be88 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/breader.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/breader.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/buffer.go b/vendor/github.com/ulikunitz/xz/lzma/buffer.go
index 9cb7838..2761de5 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/buffer.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/buffer.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/bytewriter.go b/vendor/github.com/ulikunitz/xz/lzma/bytewriter.go
index 290606d..040874c 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/bytewriter.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/bytewriter.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/decoder.go b/vendor/github.com/ulikunitz/xz/lzma/decoder.go
index e5a760a..cbb943a 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/decoder.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/decoder.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -200,7 +200,7 @@ func (d *decoder) decompress() error {
 		op, err := d.readOp()
 		switch err {
 		case nil:
-			break
+			// break
 		case errEOS:
 			d.eos = true
 			if !d.rd.possiblyAtEnd() {
diff --git a/vendor/github.com/ulikunitz/xz/lzma/decoderdict.go b/vendor/github.com/ulikunitz/xz/lzma/decoderdict.go
index ba06712..8cd616e 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/decoderdict.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/decoderdict.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -126,10 +126,3 @@ func (d *decoderDict) Available() int { return d.buf.Available() }
 
 // Read reads data from the buffer contained in the decoder dictionary.
 func (d *decoderDict) Read(p []byte) (n int, err error) { return d.buf.Read(p) }
-
-// Buffered returns the number of bytes currently buffered in the
-// decoder dictionary.
-func (d *decoderDict) buffered() int { return d.buf.Buffered() }
-
-// Peek gets data from the buffer without advancing the rear index.
-func (d *decoderDict) peek(p []byte) (n int, err error) { return d.buf.Peek(p) }
diff --git a/vendor/github.com/ulikunitz/xz/lzma/directcodec.go b/vendor/github.com/ulikunitz/xz/lzma/directcodec.go
index e6e0c6d..20b256a 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/directcodec.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/directcodec.go
@@ -1,24 +1,13 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package lzma
 
-import "fmt"
-
 // directCodec allows the encoding and decoding of values with a fixed number
 // of bits. The number of bits must be in the range [1,32].
 type directCodec byte
 
-// makeDirectCodec creates a directCodec. The function panics if the number of
-// bits is not in the range [1,32].
-func makeDirectCodec(bits int) directCodec {
-	if !(1 <= bits && bits <= 32) {
-		panic(fmt.Errorf("bits=%d out of range", bits))
-	}
-	return directCodec(bits)
-}
-
 // Bits returns the number of bits supported by this codec.
 func (dc directCodec) Bits() int {
 	return int(dc)
diff --git a/vendor/github.com/ulikunitz/xz/lzma/distcodec.go b/vendor/github.com/ulikunitz/xz/lzma/distcodec.go
index 69871c0..60ed9ae 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/distcodec.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/distcodec.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -20,8 +20,6 @@ const (
 	posSlotBits = 6
 	// number of align bits
 	alignBits = 4
-	// maximum position slot
-	maxPosSlot = 63
 )
 
 // distCodec provides encoding and decoding of distance values.
@@ -45,20 +43,6 @@ func (dc *distCodec) deepcopy(src *distCodec) {
 	dc.alignCodec.deepcopy(&src.alignCodec)
 }
 
-// distBits returns the number of bits required to encode dist.
-func distBits(dist uint32) int {
-	if dist < startPosModel {
-		return 6
-	}
-	// slot s > 3, dist d
-	// s = 2(bits(d)-1) + bit(d, bits(d)-2)
-	// s>>1 = bits(d)-1
-	// bits(d) = 32-nlz32(d)
-	// s>>1=31-nlz32(d)
-	// n = 5 + (s>>1) = 36 - nlz32(d)
-	return 36 - nlz32(dist)
-}
-
 // newDistCodec creates a new distance codec.
 func (dc *distCodec) init() {
 	for i := range dc.posSlotCodecs {
diff --git a/vendor/github.com/ulikunitz/xz/lzma/encoder.go b/vendor/github.com/ulikunitz/xz/lzma/encoder.go
index 59055eb..5ed057a 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/encoder.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/encoder.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/encoderdict.go b/vendor/github.com/ulikunitz/xz/lzma/encoderdict.go
index 40f3d3f..056f897 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/encoderdict.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/encoderdict.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -19,7 +19,7 @@ type matcher interface {
 }
 
 // encoderDict provides the dictionary of the encoder. It includes an
-// addtional buffer atop of the actual dictionary.
+// additional buffer atop of the actual dictionary.
 type encoderDict struct {
 	buf      buffer
 	m        matcher
diff --git a/vendor/github.com/ulikunitz/xz/lzma/hashtable.go b/vendor/github.com/ulikunitz/xz/lzma/hashtable.go
index e82970e..0fb7910 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/hashtable.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/hashtable.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/header.go b/vendor/github.com/ulikunitz/xz/lzma/header.go
index cda3946..04276c8 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/header.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/header.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/header2.go b/vendor/github.com/ulikunitz/xz/lzma/header2.go
index cd14881..be54dd8 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/header2.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/header2.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -264,7 +264,7 @@ type chunkState byte
 // state
 const (
 	start chunkState = 'S'
-	stop             = 'T'
+	stop  chunkState = 'T'
 )
 
 // errors for the chunk state handling
diff --git a/vendor/github.com/ulikunitz/xz/lzma/lengthcodec.go b/vendor/github.com/ulikunitz/xz/lzma/lengthcodec.go
index 927395b..6e0edfc 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/lengthcodec.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/lengthcodec.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -56,19 +56,6 @@ func (lc *lengthCodec) init() {
 	lc.high = makeTreeCodec(8)
 }
 
-// lBits gives the number of bits used for the encoding of the l value
-// provided to the range encoder.
-func lBits(l uint32) int {
-	switch {
-	case l < 8:
-		return 4
-	case l < 16:
-		return 5
-	default:
-		return 10
-	}
-}
-
 // Encode encodes the length offset. The length offset l can be compute by
 // subtracting minMatchLen (2) from the actual length.
 //
diff --git a/vendor/github.com/ulikunitz/xz/lzma/literalcodec.go b/vendor/github.com/ulikunitz/xz/lzma/literalcodec.go
index ca31530..0bfc763 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/literalcodec.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/literalcodec.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -123,10 +123,3 @@ const (
 	minLP = 0
 	maxLP = 4
 )
-
-// minState and maxState define a range for the state values stored in
-// the State values.
-const (
-	minState = 0
-	maxState = 11
-)
diff --git a/vendor/github.com/ulikunitz/xz/lzma/matchalgorithm.go b/vendor/github.com/ulikunitz/xz/lzma/matchalgorithm.go
index 7d03ec0..96ebda0 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/matchalgorithm.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/matchalgorithm.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/operation.go b/vendor/github.com/ulikunitz/xz/lzma/operation.go
index a75c9b4..026ce48 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/operation.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/operation.go
@@ -1,11 +1,10 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package lzma
 
 import (
-	"errors"
 	"fmt"
 	"unicode"
 )
@@ -24,30 +23,6 @@ type match struct {
 	n int
 }
 
-// verify checks whether the match is valid. If that is not the case an
-// error is returned.
-func (m match) verify() error {
-	if !(minDistance <= m.distance && m.distance <= maxDistance) {
-		return errors.New("distance out of range")
-	}
-	if !(1 <= m.n && m.n <= maxMatchLen) {
-		return errors.New("length out of range")
-	}
-	return nil
-}
-
-// l return the l-value for the match, which is the difference of length
-// n and 2.
-func (m match) l() uint32 {
-	return uint32(m.n - minMatchLen)
-}
-
-// dist returns the dist value for the match, which is one less of the
-// distance stored in the match.
-func (m match) dist() uint32 {
-	return uint32(m.distance - minDistance)
-}
-
 // Len returns the number of bytes matched.
 func (m match) Len() int {
 	return m.n
diff --git a/vendor/github.com/ulikunitz/xz/lzma/prob.go b/vendor/github.com/ulikunitz/xz/lzma/prob.go
index 6987a16..9a2648e 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/prob.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/prob.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/properties.go b/vendor/github.com/ulikunitz/xz/lzma/properties.go
index 662feba..f229fc9 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/properties.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/properties.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/rangecodec.go b/vendor/github.com/ulikunitz/xz/lzma/rangecodec.go
index 7189a03..57f1ab9 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/rangecodec.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/rangecodec.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -131,32 +131,6 @@ type rangeDecoder struct {
 	code   uint32
 }
 
-// init initializes the range decoder, by reading from the byte reader.
-func (d *rangeDecoder) init() error {
-	d.nrange = 0xffffffff
-	d.code = 0
-
-	b, err := d.br.ReadByte()
-	if err != nil {
-		return err
-	}
-	if b != 0 {
-		return errors.New("newRangeDecoder: first byte not zero")
-	}
-
-	for i := 0; i < 4; i++ {
-		if err = d.updateCode(); err != nil {
-			return err
-		}
-	}
-
-	if d.code >= d.nrange {
-		return errors.New("newRangeDecoder: d.code >= d.nrange")
-	}
-
-	return nil
-}
-
 // newRangeDecoder initializes a range decoder. It reads five bytes from the
 // reader and therefore may return an error.
 func newRangeDecoder(br io.ByteReader) (d *rangeDecoder, err error) {
diff --git a/vendor/github.com/ulikunitz/xz/lzma/reader.go b/vendor/github.com/ulikunitz/xz/lzma/reader.go
index 7b7eef3..2ed13c8 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/reader.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/reader.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/reader2.go b/vendor/github.com/ulikunitz/xz/lzma/reader2.go
index 33074e6..de3da37 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/reader2.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/reader2.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -48,7 +48,6 @@ type Reader2 struct {
 	chunkReader io.Reader
 
 	cstate chunkState
-	ctype  chunkType
 }
 
 // NewReader2 creates a reader for an LZMA2 chunk sequence.
diff --git a/vendor/github.com/ulikunitz/xz/lzma/state.go b/vendor/github.com/ulikunitz/xz/lzma/state.go
index 03f061c..09d62f7 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/state.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/state.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -53,12 +53,6 @@ func (s *state) Reset() {
 	s.distCodec.init()
 }
 
-// initState initializes the state.
-func initState(s *state, p Properties) {
-	*s = state{Properties: p}
-	s.Reset()
-}
-
 // newState creates a new state from the give Properties.
 func newState(p Properties) *state {
 	s := &state{Properties: p}
diff --git a/vendor/github.com/ulikunitz/xz/lzma/treecodecs.go b/vendor/github.com/ulikunitz/xz/lzma/treecodecs.go
index 1cb3596..6e927e9 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/treecodecs.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/treecodecs.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/writer.go b/vendor/github.com/ulikunitz/xz/lzma/writer.go
index 5803ecc..d0d220f 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/writer.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/writer.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzma/writer2.go b/vendor/github.com/ulikunitz/xz/lzma/writer2.go
index c263b06..dfaaec9 100644
--- a/vendor/github.com/ulikunitz/xz/lzma/writer2.go
+++ b/vendor/github.com/ulikunitz/xz/lzma/writer2.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/lzmafilter.go b/vendor/github.com/ulikunitz/xz/lzmafilter.go
index 6f4aa2c..4f1bb33 100644
--- a/vendor/github.com/ulikunitz/xz/lzmafilter.go
+++ b/vendor/github.com/ulikunitz/xz/lzmafilter.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/none-check.go b/vendor/github.com/ulikunitz/xz/none-check.go
index e12d8e4..9524013 100644
--- a/vendor/github.com/ulikunitz/xz/none-check.go
+++ b/vendor/github.com/ulikunitz/xz/none-check.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/vendor/github.com/ulikunitz/xz/reader.go b/vendor/github.com/ulikunitz/xz/reader.go
index 22cd6d5..7f974ff 100644
--- a/vendor/github.com/ulikunitz/xz/reader.go
+++ b/vendor/github.com/ulikunitz/xz/reader.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -26,13 +26,6 @@ type ReaderConfig struct {
 	SingleStream bool
 }
 
-// fill replaces all zero values with their default values.
-func (c *ReaderConfig) fill() {
-	if c.DictCap == 0 {
-		c.DictCap = 8 * 1024 * 1024
-	}
-}
-
 // Verify checks the reader parameters for Validity. Zero values will be
 // replaced by default values.
 func (c *ReaderConfig) Verify() error {
@@ -165,22 +158,16 @@ func (c ReaderConfig) newStreamReader(xz io.Reader) (r *streamReader, err error)
 	return r, nil
 }
 
-// errIndex indicates an error with the xz file index.
-var errIndex = errors.New("xz: error in xz file index")
-
 // readTail reads the index body and the xz footer.
 func (r *streamReader) readTail() error {
-	index, n, err := readIndexBody(r.xz)
+	index, n, err := readIndexBody(r.xz, len(r.index))
 	if err != nil {
 		if err == io.EOF {
 			err = io.ErrUnexpectedEOF
 		}
 		return err
 	}
-	if len(index) != len(r.index) {
-		return fmt.Errorf("xz: index length is %d; want %d",
-			len(index), len(r.index))
-	}
+
 	for i, rec := range r.index {
 		if rec != index[i] {
 			return fmt.Errorf("xz: record %d is %v; want %v",
@@ -265,7 +252,6 @@ type blockReader struct {
 	n         int64
 	hash      hash.Hash
 	r         io.Reader
-	err       error
 }
 
 // newBlockReader creates a new block reader.
@@ -315,10 +301,6 @@ func (br *blockReader) record() record {
 	return record{br.unpaddedSize(), br.uncompressedSize()}
 }
 
-// errBlockSize indicates that the size of the block in the block header
-// is wrong.
-var errBlockSize = errors.New("xz: wrong uncompressed size for block")
-
 // Read reads data from the block.
 func (br *blockReader) Read(p []byte) (n int, err error) {
 	n, err = br.r.Read(p)
diff --git a/vendor/github.com/ulikunitz/xz/writer.go b/vendor/github.com/ulikunitz/xz/writer.go
index aec10df..6b3a666 100644
--- a/vendor/github.com/ulikunitz/xz/writer.go
+++ b/vendor/github.com/ulikunitz/xz/writer.go
@@ -1,4 +1,4 @@
-// Copyright 2014-2019 Ulrich Kunitz. All rights reserved.
+// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,6 +6,7 @@ package xz
 
 import (
 	"errors"
+	"fmt"
 	"hash"
 	"io"
 
@@ -190,6 +191,9 @@ func (c WriterConfig) NewWriter(xz io.Writer) (w *Writer, err error) {
 		return nil, err
 	}
 	data, err := w.h.MarshalBinary()
+	if err != nil {
+		return nil, fmt.Errorf("w.h.MarshalBinary(): error %w", err)
+	}
 	if _, err = xz.Write(data); err != nil {
 		return nil, err
 	}
diff --git a/vendor/gopkg.in/yaml.v3/.travis.yml b/vendor/gopkg.in/yaml.v3/.travis.yml
deleted file mode 100644
index a130fe8..0000000
--- a/vendor/gopkg.in/yaml.v3/.travis.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-language: go
-
-go:
-    - "1.4.x"
-    - "1.5.x"
-    - "1.6.x"
-    - "1.7.x"
-    - "1.8.x"
-    - "1.9.x"
-    - "1.10.x"
-    - "1.11.x"
-    - "1.12.x"
-    - "1.13.x"
-    - "1.14.x"
-    - "tip"
-
-go_import_path: gopkg.in/yaml.v3
diff --git a/vendor/gopkg.in/yaml.v3/decode.go b/vendor/gopkg.in/yaml.v3/decode.go
index 21c0dac..df36e3a 100644
--- a/vendor/gopkg.in/yaml.v3/decode.go
+++ b/vendor/gopkg.in/yaml.v3/decode.go
@@ -399,7 +399,7 @@ func (d *decoder) callObsoleteUnmarshaler(n *Node, u obsoleteUnmarshaler) (good
 //
 // If n holds a null value, prepare returns before doing anything.
 func (d *decoder) prepare(n *Node, out reflect.Value) (newout reflect.Value, unmarshaled, good bool) {
-	if n.ShortTag() == nullTag || n.Kind == 0 && n.IsZero() {
+	if n.ShortTag() == nullTag {
 		return out, false, false
 	}
 	again := true
@@ -808,8 +808,10 @@ func (d *decoder) mapping(n *Node, out reflect.Value) (good bool) {
 		}
 	}
 
+	mapIsNew := false
 	if out.IsNil() {
 		out.Set(reflect.MakeMap(outt))
+		mapIsNew = true
 	}
 	for i := 0; i < l; i += 2 {
 		if isMerge(n.Content[i]) {
@@ -826,7 +828,7 @@ func (d *decoder) mapping(n *Node, out reflect.Value) (good bool) {
 				failf("invalid map key: %#v", k.Interface())
 			}
 			e := reflect.New(et).Elem()
-			if d.unmarshal(n.Content[i+1], e) {
+			if d.unmarshal(n.Content[i+1], e) || n.Content[i+1].ShortTag() == nullTag && (mapIsNew || !out.MapIndex(k).IsValid()) {
 				out.SetMapIndex(k, e)
 			}
 		}
diff --git a/vendor/gopkg.in/yaml.v3/emitterc.go b/vendor/gopkg.in/yaml.v3/emitterc.go
index c29217e..0f47c9c 100644
--- a/vendor/gopkg.in/yaml.v3/emitterc.go
+++ b/vendor/gopkg.in/yaml.v3/emitterc.go
@@ -814,26 +814,24 @@ func yaml_emitter_emit_block_mapping_value(emitter *yaml_emitter_t, event *yaml_
 		}
 	}
 	if len(emitter.key_line_comment) > 0 {
-		// [Go] A line comment was previously provided for the key. Handle it before
-		//      the value so the inline comments are placed correctly.
-		if yaml_emitter_silent_nil_event(emitter, event) && len(emitter.line_comment) == 0 {
-			// Nothing other than the line comment will be written on the line.
-			emitter.line_comment = emitter.key_line_comment
-			emitter.key_line_comment = nil
-		} else {
-			// An actual value is coming, so emit the comment line.
+		// [Go] Line comments are generally associated with the value, but when there's
+		//      no value on the same line as a mapping key they end up attached to the
+		//      key itself.
+		if event.typ == yaml_SCALAR_EVENT {
+			if len(emitter.line_comment) == 0 {
+				// A scalar is coming and it has no line comments by itself yet,
+				// so just let it handle the line comment as usual. If it has a
+				// line comment, we can't have both so the one from the key is lost.
+				emitter.line_comment = emitter.key_line_comment
+				emitter.key_line_comment = nil
+			}
+		} else if event.sequence_style() != yaml_FLOW_SEQUENCE_STYLE && (event.typ == yaml_MAPPING_START_EVENT || event.typ == yaml_SEQUENCE_START_EVENT) {
+			// An indented block follows, so write the comment right now.
 			emitter.line_comment, emitter.key_line_comment = emitter.key_line_comment, emitter.line_comment
 			if !yaml_emitter_process_line_comment(emitter) {
 				return false
 			}
 			emitter.line_comment, emitter.key_line_comment = emitter.key_line_comment, emitter.line_comment
-			// Indent in unless it's a block that will reindent anyway.
-			if event.sequence_style() == yaml_FLOW_SEQUENCE_STYLE || (event.typ != yaml_MAPPING_START_EVENT && event.typ != yaml_SEQUENCE_START_EVENT) {
-				emitter.indent = emitter.best_indent*((emitter.indent+emitter.best_indent)/emitter.best_indent)
-				if !yaml_emitter_write_indent(emitter) {
-					return false
-				}
-			}
 		}
 	}
 	emitter.states = append(emitter.states, yaml_EMIT_BLOCK_MAPPING_KEY_STATE)
@@ -1896,7 +1894,7 @@ func yaml_emitter_write_literal_scalar(emitter *yaml_emitter_t, value []byte) bo
 	if !yaml_emitter_write_block_scalar_hints(emitter, value) {
 		return false
 	}
-	if !put_break(emitter) {
+	if !yaml_emitter_process_line_comment(emitter) {
 		return false
 	}
 	//emitter.indention = true
@@ -1933,10 +1931,10 @@ func yaml_emitter_write_folded_scalar(emitter *yaml_emitter_t, value []byte) boo
 	if !yaml_emitter_write_block_scalar_hints(emitter, value) {
 		return false
 	}
-
-	if !put_break(emitter) {
+	if !yaml_emitter_process_line_comment(emitter) {
 		return false
 	}
+
 	//emitter.indention = true
 	emitter.whitespace = true
 
diff --git a/vendor/gopkg.in/yaml.v3/encode.go b/vendor/gopkg.in/yaml.v3/encode.go
index 45e8d1e..de9e72a 100644
--- a/vendor/gopkg.in/yaml.v3/encode.go
+++ b/vendor/gopkg.in/yaml.v3/encode.go
@@ -120,6 +120,11 @@ func (e *encoder) marshal(tag string, in reflect.Value) {
 		e.nodev(in)
 		return
 	case Node:
+		if !in.CanAddr() {
+			var n = reflect.New(in.Type()).Elem()
+			n.Set(in)
+			in = n
+		}
 		e.nodev(in.Addr())
 		return
 	case time.Time:
diff --git a/vendor/gopkg.in/yaml.v3/scannerc.go b/vendor/gopkg.in/yaml.v3/scannerc.go
index d9a539c..ca00701 100644
--- a/vendor/gopkg.in/yaml.v3/scannerc.go
+++ b/vendor/gopkg.in/yaml.v3/scannerc.go
@@ -2260,10 +2260,9 @@ func yaml_parser_scan_block_scalar(parser *yaml_parser_t, token *yaml_token_t, l
 		}
 	}
 	if parser.buffer[parser.buffer_pos] == '#' {
-		// TODO Test this and then re-enable it.
-		//if !yaml_parser_scan_line_comment(parser, start_mark) {
-		//	return false
-		//}
+		if !yaml_parser_scan_line_comment(parser, start_mark) {
+			return false
+		}
 		for !is_breakz(parser.buffer, parser.buffer_pos) {
 			skip(parser)
 			if parser.unread < 1 && !yaml_parser_update_buffer(parser, 1) {
@@ -2892,6 +2891,10 @@ func yaml_parser_scan_comments(parser *yaml_parser_t, scan_mark yaml_mark_t) boo
 
 	var token_mark = token.start_mark
 	var start_mark yaml_mark_t
+	var next_indent = parser.indent
+	if next_indent < 0 {
+		next_indent = 0
+	}
 
 	var recent_empty = false
 	var first_empty = parser.newlines <= 1
@@ -2923,15 +2926,18 @@ func yaml_parser_scan_comments(parser *yaml_parser_t, scan_mark yaml_mark_t) boo
 			continue
 		}
 		c := parser.buffer[parser.buffer_pos+peek]
-		if is_breakz(parser.buffer, parser.buffer_pos+peek) || parser.flow_level > 0 && (c == ']' || c == '}') {
+		var close_flow = parser.flow_level > 0 && (c == ']' || c == '}')
+		if close_flow || is_breakz(parser.buffer, parser.buffer_pos+peek) {
 			// Got line break or terminator.
-			if !recent_empty {
-				if first_empty && (start_mark.line == foot_line || start_mark.column-1 < parser.indent) {
+			if close_flow || !recent_empty {
+				if close_flow || first_empty && (start_mark.line == foot_line && token.typ != yaml_VALUE_TOKEN || start_mark.column-1 < next_indent) {
 					// This is the first empty line and there were no empty lines before,
 					// so this initial part of the comment is a foot of the prior token
 					// instead of being a head for the following one. Split it up.
+					// Alternatively, this might also be the last comment inside a flow
+					// scope, so it must be a footer.
 					if len(text) > 0 {
-						if start_mark.column-1 < parser.indent {
+						if start_mark.column-1 < next_indent {
 							// If dedented it's unrelated to the prior token.
 							token_mark = start_mark
 						}
@@ -2962,7 +2968,7 @@ func yaml_parser_scan_comments(parser *yaml_parser_t, scan_mark yaml_mark_t) boo
 			continue
 		}
 
-		if len(text) > 0 && column < parser.indent+1 && column != start_mark.column {
+		if len(text) > 0 && (close_flow || column-1 < next_indent && column != start_mark.column) {
 			// The comment at the different indentation is a foot of the
 			// preceding data rather than a head of the upcoming one.
 			parser.comments = append(parser.comments, yaml_comment_t{
@@ -3013,6 +3019,10 @@ func yaml_parser_scan_comments(parser *yaml_parser_t, scan_mark yaml_mark_t) boo
 		peek = 0
 		column = 0
 		line = parser.mark.line
+		next_indent = parser.indent
+		if next_indent < 0 {
+			next_indent = 0
+		}
 	}
 
 	if len(text) > 0 {
diff --git a/vendor/gopkg.in/yaml.v3/yaml.go b/vendor/gopkg.in/yaml.v3/yaml.go
index 56e8a84..8cec6da 100644
--- a/vendor/gopkg.in/yaml.v3/yaml.go
+++ b/vendor/gopkg.in/yaml.v3/yaml.go
@@ -449,6 +449,11 @@ func (n *Node) ShortTag() string {
 		case ScalarNode:
 			tag, _ := resolve("", n.Value)
 			return tag
+		case 0:
+			// Special case to make the zero value convenient.
+			if n.IsZero() {
+				return nullTag
+			}
 		}
 		return ""
 	}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index f3dcd53..a0c4611 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -1,12 +1,11 @@
-# github.com/andybalholm/brotli v1.0.0
+# github.com/andybalholm/brotli v1.0.4
 ## explicit; go 1.12
 github.com/andybalholm/brotli
-# github.com/bitrise-io/go-utils v0.0.0-20200224122728-e212188d99b4
+# github.com/bitrise-io/go-utils v1.0.2
 ## explicit; go 1.13
 github.com/bitrise-io/go-utils/colorstring
 github.com/bitrise-io/go-utils/command
 github.com/bitrise-io/go-utils/command/git
-github.com/bitrise-io/go-utils/errorutil
 github.com/bitrise-io/go-utils/fileutil
 github.com/bitrise-io/go-utils/log
 github.com/bitrise-io/go-utils/parseutil
@@ -30,41 +29,43 @@ github.com/dsnet/compress/bzip2/internal/sais
 github.com/dsnet/compress/internal
 github.com/dsnet/compress/internal/errors
 github.com/dsnet/compress/internal/prefix
-# github.com/golang/snappy v0.0.1
+# github.com/golang/snappy v0.0.4
 ## explicit
 github.com/golang/snappy
-# github.com/klauspost/compress v1.10.10
-## explicit; go 1.13
+# github.com/klauspost/compress v1.15.4
+## explicit; go 1.16
+github.com/klauspost/compress
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/fse
 github.com/klauspost/compress/huff0
-github.com/klauspost/compress/snappy
+github.com/klauspost/compress/internal/cpuinfo
+github.com/klauspost/compress/internal/snapref
 github.com/klauspost/compress/zstd
 github.com/klauspost/compress/zstd/internal/xxhash
-# github.com/klauspost/pgzip v1.2.4
+# github.com/klauspost/pgzip v1.2.5
 ## explicit
 github.com/klauspost/pgzip
-# github.com/magiconair/properties v1.8.2-0.20200610093828-cfba716d4c41
+# github.com/magiconair/properties v1.8.6
 ## explicit; go 1.13
 github.com/magiconair/properties
 # github.com/mholt/archiver v1.1.3-0.20190917152942-233fc16b9615
 ## explicit; go 1.12
 github.com/mholt/archiver
-# github.com/nwaples/rardecode v1.1.0
+# github.com/nwaples/rardecode v1.1.3
 ## explicit
 github.com/nwaples/rardecode
-# github.com/pierrec/lz4 v2.5.2+incompatible
+# github.com/pierrec/lz4 v2.6.1+incompatible
 ## explicit
 github.com/pierrec/lz4
 github.com/pierrec/lz4/internal/xxh32
 # github.com/pmezard/go-difflib v1.0.0
 ## explicit
 github.com/pmezard/go-difflib/difflib
-# github.com/stretchr/testify v1.6.1
+# github.com/stretchr/testify v1.7.0
 ## explicit; go 1.13
 github.com/stretchr/testify/assert
 github.com/stretchr/testify/require
-# github.com/ulikunitz/xz v0.5.7
+# github.com/ulikunitz/xz v0.5.10
 ## explicit; go 1.12
 github.com/ulikunitz/xz
 github.com/ulikunitz/xz/internal/hash
@@ -73,6 +74,6 @@ github.com/ulikunitz/xz/lzma
 # github.com/xi2/xz v0.0.0-20171230120015-48954b6210f8
 ## explicit
 github.com/xi2/xz
-# gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776
+# gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b
 ## explicit
 gopkg.in/yaml.v3