$darkmode
SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
bgzf_stream_util.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <algorithm>
16#include <array>
17#include <cassert>
18#include <cstring>
19#include <memory>
20#include <span>
21#include <thread>
22
28
29#if !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
30# error "This file cannot be used when building without GZip-support."
31#endif // !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
32
33#if defined(SEQAN3_HAS_ZLIB)
34// Zlib headers
35#include <zlib.h>
36
37namespace seqan3::contrib
38{
39
40// ============================================================================
41// Forwards
42// ============================================================================
43
44// ============================================================================
45// Classes
46// ============================================================================
47
48// Special end-of-file marker defined by the BGZF compression format.
49// See: https://samtools.github.io/hts-specs/SAMv1.pdf
50static constexpr std::array<char, 28> BGZF_END_OF_FILE_MARKER {{'\x1f', '\x8b', '\x08', '\x04',
51 '\x00', '\x00', '\x00', '\x00',
52 '\x00', '\xff', '\x06', '\x00',
53 '\x42', '\x43', '\x02', '\x00',
54 '\x1b', '\x00', '\x03', '\x00',
55 '\x00', '\x00', '\x00', '\x00',
56 '\x00', '\x00', '\x00', '\x00'}};
57
58template <typename TAlgTag>
59struct CompressionContext {};
60
61template <typename TAlgTag>
62struct DefaultPageSize;
63
64template <>
65struct CompressionContext<detail::gz_compression>
66{
67 z_stream strm;
68
69 CompressionContext()
70 {
71 std::memset(&strm, 0, sizeof(z_stream));
72 }
73};
74
75template <>
76struct CompressionContext<detail::bgzf_compression>:
77 CompressionContext<detail::gz_compression>
78{
79 static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
80 unsigned char headerPos;
81};
82
83template <>
84struct DefaultPageSize<detail::bgzf_compression>
85{
86 static const unsigned MAX_BLOCK_SIZE = 64 * 1024;
87 static const unsigned BLOCK_FOOTER_LENGTH = 8;
88 // 5 bytes block overhead (see 3.2.4. at https://tools.ietf.org/html/rfc1951)
89 static const unsigned ZLIB_BLOCK_OVERHEAD = 5;
90
91 // Reduce the maximal input size, such that the compressed data
92 // always fits in one block even for level Z_NO_COMPRESSION.
93 enum { BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH };
94 static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
95};
96
97// ============================================================================
98// Functions
99// ============================================================================
100
101// ----------------------------------------------------------------------------
102// Function compressInit()
103// ----------------------------------------------------------------------------
104
105inline void
106compressInit(CompressionContext<detail::gz_compression> & ctx)
107{
108 const int GZIP_WINDOW_BITS = -15; // no zlib header
109 const int Z_DEFAULT_MEM_LEVEL = 8;
110
111 ctx.strm.zalloc = NULL;
112 ctx.strm.zfree = NULL;
113
114 // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
115 // to be 2x faster and produces only 7% bigger output
116// int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
117// GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
118 int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED,
119 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
120 if (status != Z_OK)
121 throw io_error("Calling deflateInit2() failed for gz file.");
122}
123
124// ----------------------------------------------------------------------------
125// Function compressInit()
126// ----------------------------------------------------------------------------
127
128inline void
129compressInit(CompressionContext<detail::bgzf_compression> & ctx)
130{
131 compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
132 ctx.headerPos = 0;
133}
134
135// ----------------------------------------------------------------------------
136// Helper Function _bgzfUnpackXX()
137// ----------------------------------------------------------------------------
138
139inline uint16_t
140_bgzfUnpack16(char const * buffer)
141{
142 uint16_t tmp;
143 std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
144 return detail::to_little_endian(tmp);
145}
146
147inline uint32_t
148_bgzfUnpack32(char const * buffer)
149{
150 uint32_t tmp;
151 std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
152 return detail::to_little_endian(tmp);
153}
154
155// ----------------------------------------------------------------------------
156// Helper Function _bgzfPackXX()
157// ----------------------------------------------------------------------------
158
159inline void
160_bgzfPack16(char * buffer, uint16_t value)
161{
162 value = detail::to_little_endian(value);
163 std::uninitialized_copy(reinterpret_cast<char *>(&value),
164 reinterpret_cast<char *>(&value) + sizeof(uint16_t),
165 buffer);
166}
167
168inline void
169_bgzfPack32(char * buffer, uint32_t value)
170{
171 value = detail::to_little_endian(value);
172 std::uninitialized_copy(reinterpret_cast<char *>(&value),
173 reinterpret_cast<char *>(&value) + sizeof(uint32_t),
174 buffer);
175}
176
177// ----------------------------------------------------------------------------
178// Function _compressBlock()
179// ----------------------------------------------------------------------------
180
181template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
182inline TDestCapacity
183_compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
184 TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
185{
186 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
187 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
188
189 assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
190 assert(sizeof(TDestValue) == 1u);
191 assert(sizeof(unsigned) == 4u);
192
193 // 1. COPY HEADER
194 std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);
195
196 // 2. COMPRESS
197 compressInit(ctx);
198 ctx.strm.next_in = (Bytef *)(srcBegin);
199 ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
200 ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
201 ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
202
203 int status = deflate(&ctx.strm, Z_FINISH);
204 if (status != Z_STREAM_END)
205 {
206 deflateEnd(&ctx.strm);
207 throw io_error("Deflation failed. Compressed BGZF data is too big.");
208 }
209
210 status = deflateEnd(&ctx.strm);
211 if (status != Z_OK)
212 throw io_error("BGZF deflateEnd() failed.");
213
214
215 // 3. APPEND FOOTER
216
217 // Set compressed length into buffer, compute CRC and write CRC into buffer.
218
219 size_t len = dstCapacity - ctx.strm.avail_out;
220 _bgzfPack16(dstBegin + 16, len - 1);
221
222 dstBegin += len - BLOCK_FOOTER_LENGTH;
223 _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
224 _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
225
226 return dstCapacity - ctx.strm.avail_out;
227}
228
229// ----------------------------------------------------------------------------
230// Function decompressInit() - GZIP
231// ----------------------------------------------------------------------------
232
233inline void
234decompressInit(CompressionContext<detail::gz_compression> & ctx)
235{
236 const int GZIP_WINDOW_BITS = -15; // no zlib header
237
238 ctx.strm.zalloc = NULL;
239 ctx.strm.zfree = NULL;
240 int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
241 if (status != Z_OK)
242 throw io_error("GZip inflateInit2() failed.");
243}
244
245// ----------------------------------------------------------------------------
246// Function decompressInit() - BGZF
247// ----------------------------------------------------------------------------
248
249inline void
250decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
251{
252 decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
253 ctx.headerPos = 0;
254}
255
256// ----------------------------------------------------------------------------
257// Function _decompressBlock()
258// ----------------------------------------------------------------------------
259
260template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
261inline TDestCapacity
262_decompressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
263 TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
264{
265 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
266 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
267
268 assert(sizeof(TSourceValue) == 1u);
269 assert(sizeof(unsigned) == 4u);
270
271 // 1. CHECK HEADER
272
273 if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
274 throw io_error("BGZF block too short.");
275
276 if (!detail::bgzf_compression::validate_header(std::span{srcBegin, srcLength}))
277 throw io_error("Invalid BGZF block header.");
278
279 size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
280 if (compressedLen != srcLength)
281 throw io_error("BGZF compressed size mismatch.");
282
283
284 // 2. DECOMPRESS
285
286 decompressInit(ctx);
287 ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
288 ctx.strm.next_out = (Bytef *)(dstBegin);
289 ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
290 ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
291
292 int status = inflate(&ctx.strm, Z_FINISH);
293 if (status != Z_STREAM_END)
294 {
295 inflateEnd(&ctx.strm);
296 throw io_error("Inflation failed. Decompressed BGZF data is too big.");
297 }
298
299 status = inflateEnd(&ctx.strm);
300 if (status != Z_OK)
301 throw io_error("BGZF inflateEnd() failed.");
302
303
304 // 3. CHECK FOOTER
305
306 // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
307
308 unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
309
310 srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
311 if (_bgzfUnpack32(srcBegin) != crc)
312 throw io_error("BGZF wrong checksum.");
313
314 if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
315 throw io_error("BGZF size mismatch.");
316
317 return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
318}
319
320} // namespace seqan3::contrib
321
322#endif // defined(SEQAN3_HAS_ZLIB)
Provides seqan3::contrib::bgzf_thread_count.
Provides various transformation traits used by the range module.
Provides exceptions used in the I/O module.
Provides seqan3::detail::magic_header.
T memset(T... args)
Provides utility functions for bit twiddling.
T uninitialized_copy(T... args)