OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_decoder_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2022, Aous Naman
6// Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2022, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9// Copyright (c) 2026, Osamu Watanabe
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
23// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33//***************************************************************************/
34// This file is part of the OpenJPH software implementation.
35// File: ojph_block_decoder_avx2.cpp
36//***************************************************************************/
37
38//***************************************************************************/
42
43#include "ojph_arch.h"
44#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
45
46#include <string>
47#include <iostream>
48
49#include <cassert>
50#include <cstring>
51#include "ojph_block_common.h"
52#include "ojph_block_decoder.h"
53#include "ojph_message.h"
54
55#include <immintrin.h>
56
57namespace ojph {
58 namespace local {
59
60 //************************************************************************/
67 struct dec_mel_st {
68 dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
69 k(0), num_runs(0), runs(0)
70 {}
71 // data decoding machinery
72 ui8* data;
73 ui64 tmp;
74 int bits;
75 int size;
76 bool unstuff;
77 int k;
78
79 // queue of decoded runs
80 int num_runs;
81 ui64 runs;
82 };
83
84 //************************************************************************/
96 static inline
97 void mel_read(dec_mel_st *melp)
98 {
99 if (melp->bits > 32) //there are enough bits in the tmp variable
100 return; // return without reading new data
101
102 ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
103 if (melp->size > 4) { // if there is data in the MEL segment
104 memcpy(&val, melp->data, 4); // read 32 bits from MEL data
105 melp->data += 4; // advance pointer
106 melp->size -= 4; // reduce counter
107 }
108 else if (melp->size > 0)
109 { // 4 or less
110 int i = 0;
111 while (melp->size > 1) {
112 ui32 v = *melp->data++; // read one byte at a time
113 ui32 m = ~(0xFFu << i); // mask of location
114 val = (val & m) | (v << i);// put one byte in its correct location
115 --melp->size;
116 i += 8;
117 }
118 // size equal to 1
119 ui32 v = *melp->data++; // the one before the last is different
120 v |= 0xF; // MEL and VLC segments can overlap
121 ui32 m = ~(0xFFu << i);
122 val = (val & m) | (v << i);
123 --melp->size;
124 }
125
126 // next we unstuff them before adding them to the buffer
127 int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
128 // the previously read byte requires
129 // unstuffing
130
131 // data is unstuffed and accumulated in t
132 // bits has the number of bits in t
133 ui32 t = val & 0xFF;
134 bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
135 bits -= unstuff; // there is one less bit in t if unstuffing is needed
136 t = t << (8 - unstuff); // move up to make room for the next byte
137
138 //this is a repeat of the above
139 t |= (val>>8) & 0xFF;
140 unstuff = (((val >> 8) & 0xFF) == 0xFF);
141 bits -= unstuff;
142 t = t << (8 - unstuff);
143
144 t |= (val>>16) & 0xFF;
145 unstuff = (((val >> 16) & 0xFF) == 0xFF);
146 bits -= unstuff;
147 t = t << (8 - unstuff);
148
149 t |= (val>>24) & 0xFF;
150 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
151
152 // move t to tmp, and push the result all the way up, so we read from
153 // the MSB
154 melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
155 melp->bits += bits; //increment the number of bits in tmp
156 }
157
158 //************************************************************************/
173 static inline
174 void mel_decode(dec_mel_st *melp)
175 {
176 static const int mel_exp[13] = { //MEL exponents
177 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
178 };
179
180 if (melp->bits < 6) // if there are less than 6 bits in tmp
181 mel_read(melp); // then read from the MEL bitstream
182 // 6 bits is the largest decodable MEL cwd
183
184 //repeat so long that there is enough decodable bits in tmp,
185 // and the runs store is not full (num_runs < 8)
186 while (melp->bits >= 6 && melp->num_runs < 8)
187 {
188 int eval = mel_exp[melp->k]; // number of bits associated with state
189 int run = 0;
190 if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB)
191 { //one is found
192 run = 1 << eval;
193 run--; // consecutive runs of 0 events - 1
194 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12
195 melp->tmp <<= 1; // consume one bit from tmp
196 melp->bits -= 1;
197 run = run << 1; // a stretch of zeros not terminating in one
198 }
199 else
200 { //0 is found
201 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
202 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0
203 melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6)
204 melp->bits -= eval + 1;
205 run = (run << 1) + 1; // a stretch of zeros terminating with one
206 }
207 eval = melp->num_runs * 7; // 7 bits per run
208 melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient
209 melp->runs |= ((ui64)run) << eval; // store the value in runs
210 melp->num_runs++; // increment count
211 }
212 }
213
214 //************************************************************************/
224 static inline
225 void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup)
226 {
227 melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL
228 melp->bits = 0; // 0 bits in tmp
229 melp->tmp = 0; //
230 melp->unstuff = false; // no unstuffing
231 melp->size = scup - 1; // size is the length of MEL+VLC-1
232 melp->k = 0; // 0 for state
233 melp->num_runs = 0; // num_runs is 0
234 melp->runs = 0; //
235
236 //This code is borrowed; original is for a different architecture
237 //These few lines take care of the case where data is not at a multiple
238 // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment
239 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
240 for (int i = 0; i < num; ++i) { // this code is similar to mel_read
241 assert(melp->unstuff == false || melp->data[0] <= 0x8F);
242 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;//if buffer is consumed
243 //set data to 0xFF
244 if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
245 // see the standard
246 melp->data += melp->size-- > 0; //increment if the end is not reached
247 int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
248 melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
249 melp->bits += d_bits; //increment tmp by number of bits
250 melp->unstuff = ((d & 0xFF) == 0xFF); //true of next byte needs
251 //unstuffing
252 }
253 melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
254 // is the MSB
255 }
256
257 //************************************************************************/
263 static inline
264 int mel_get_run(dec_mel_st *melp)
265 {
266 if (melp->num_runs == 0) //if no runs, decode more bit from MEL segment
267 mel_decode(melp);
268
269 int t = melp->runs & 0x7F; //retrieve one run
270 melp->runs >>= 7; // remove the retrieved run
271 melp->num_runs--;
272 return t; // return run
273 }
274
275
276 //************************************************************************/
302 template<int X>
303 static inline
304 ui32 destuff_frwd(const ui8* src, int size, ui8* dst, ui32 cap)
305 {
306 if (size < 0)
307 size = 0;
308 ui8* o = dst;
309 ui8* o_end = dst + cap;
310 const ui8* s = src;
311 const ui8* s_end = src + size;
312 ui64 acc = 0; // partial output byte, low nb bits are valid
313 ui32 nb = 0; // number of valid bits in acc; always < 8
314 bool prev_ff = false;
315
316 // fast path; 16 source bytes at a time when they contain no 0xFF
317 while (s + 16 <= s_end && o + 24 <= o_end)
318 {
319 __m128i v = _mm_loadu_si128((const __m128i*)s);
320 int ff = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8(-1)));
321 if (ff != 0 || prev_ff)
322 { // process these 16 bytes one at a time
323 for (int i = 0; i < 16; ++i) {
324 ui8 b = *s++;
325 acc |= (ui64)b << nb;
326 nb += prev_ff ? 7u : 8u;
327 prev_ff = (b == 0xFFu);
328 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
329 }
330 continue;
331 }
332 ui64 v0, v1;
333 memcpy(&v0, s, 8);
334 memcpy(&v1, s + 8, 8);
335 ui64 w0 = acc | (v0 << nb);
336 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
337 memcpy(o, &w0, 8);
338 memcpy(o + 8, &w1, 8);
339 acc = nb ? (v1 >> (64 - nb)) : 0;
340 o += 16;
341 s += 16;
342 }
343 // tail; one byte at a time
344 while (s < s_end && o < o_end)
345 {
346 ui8 b = *s++;
347 acc |= (ui64)b << nb;
348 nb += prev_ff ? 7u : 8u;
349 prev_ff = (b == 0xFFu);
350 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
351 }
352 // fill the bits above nb with X, and pad with X bytes
353 ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0;
354 *o = (ui8)((ui32)acc | fill);
355 __m256i pad = _mm256_set1_epi8((char)X);
356 _mm256_storeu_si256((__m256i*)(o + 1), pad);
357 _mm256_storeu_si256((__m256i*)(o + 33), pad);
358 return (ui32)(o - dst) + 1;
359 }
360
361 //************************************************************************/
376 __m128i dfetch(const ui8* dbuf, ui32 limit, ui32 pos)
377 {
378 ui32 off = pos >> 3;
379 off = off < limit ? off : limit;
380 const ui8* p = dbuf + off;
381 __m128i v = _mm_loadu_si128((const __m128i*)p);
382 __m128i w = _mm_loadu_si128((const __m128i*)(p + 8));
383 int k = (int)(pos & 7);
384 __m128i r = _mm_srl_epi64(v, _mm_cvtsi32_si128(k));
385 __m128i c = _mm_sll_epi64(w, _mm_cvtsi32_si128(64 - k));
386 return _mm_or_si128(r, c);
387 }
388
389 //************************************************************************/
401 ui64 dfetch64(const ui8* dbuf, ui32 limit, ui32 pos)
402 {
403 ui32 off = pos >> 3;
404 off = off < limit ? off : limit;
405 ui64 v;
406 memcpy(&v, dbuf + off, 8);
407 return v >> (pos & 7);
408 }
409
410 //************************************************************************/
430 void drefill(ui64& val, ui32& bits, ui32& off,
431 const ui8* dbuf, ui32 limit)
432 {
433 ui64 v;
434 ui32 o = off < limit ? off : limit;
435 memcpy(&v, dbuf + o, 8);
436 val |= v << bits;
437 off += (63 - bits) >> 3;
438 bits |= 56;
439 }
440
441 //************************************************************************/
449 void dconsume(ui64& val, ui32& bits, ui32 num_bits)
450 {
451 val >>= num_bits;
452 bits -= num_bits;
453 }
454
455 //************************************************************************/
485 static inline
486 ui32 destuff_rev(const ui8* p, int size, bool unstuff,
487 ui64 acc, ui32 nb, ui8* dst, ui32 cap)
488 {
489 ui8* o = dst;
490 ui8* o_end = dst + cap;
491
492 // process the first byte with the caller-provided unstuff state;
493 // afterwards the state is implied by the byte at p[1], which the
494 // fast path checks vectorially (and which stays inside the segment)
495 if (size > 0 && o < o_end)
496 {
497 ui32 d = *p--;
498 --size;
499 acc |= (ui64)d << nb;
500 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
501 unstuff = d > 0x8F;
502 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
503 }
504
505 // fast path; 16 source bytes at a time when none needs unstuffing
506 while (size >= 16 && o + 24 <= o_end)
507 {
508 __m128i v = _mm_loadu_si128((const __m128i*)(p - 15));
509 __m128i nx = _mm_loadu_si128((const __m128i*)(p - 14));
510 __m128i is7f = _mm_cmpeq_epi8(
511 _mm_and_si128(v, _mm_set1_epi8(0x7F)), _mm_set1_epi8(0x7F));
512 // le8f is 0xFF where the byte after (in memory) is <= 0x8F
513 __m128i le8f = _mm_cmpeq_epi8(
514 _mm_subs_epu8(nx, _mm_set1_epi8((char)0x8F)),
515 _mm_setzero_si128());
516 __m128i stuff = _mm_andnot_si128(le8f, is7f);
517 if (!_mm_testz_si128(stuff, stuff))
518 { // process these 16 bytes one at a time
519 for (int i = 0; i < 16; ++i) {
520 ui32 d = *p--;
521 acc |= (ui64)d << nb;
522 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
523 unstuff = d > 0x8F;
524 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
525 }
526 size -= 16;
527 continue;
528 }
529 __m128i r = _mm_shuffle_epi8(v,
530 _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
531 8, 9, 10, 11, 12, 13, 14, 15)); // reverse bytes
532 ui64 v0 = (ui64)_mm_cvtsi128_si64(r);
533 ui64 v1 = (ui64)_mm_extract_epi64(r, 1);
534 ui64 w0 = acc | (v0 << nb);
535 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
536 memcpy(o, &w0, 8);
537 memcpy(o + 8, &w1, 8);
538 acc = nb ? (v1 >> (64 - nb)) : 0;
539 o += 16;
540 p -= 16;
541 size -= 16;
542 unstuff = p[1] > 0x8F;
543 }
544 // tail; one byte at a time
545 while (size > 0 && o < o_end)
546 {
547 ui32 d = *p--;
548 --size;
549 acc |= (ui64)d << nb;
550 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
551 unstuff = d > 0x8F;
552 if (nb >= 8) { *o++ = (ui8)acc; acc >>= 8; nb -= 8; }
553 }
554 // the bits above nb are already 0 = fill; pad with zero bytes
555 *o = (ui8)acc;
556 __m256i z = _mm256_setzero_si256();
557 _mm256_storeu_si256((__m256i*)(o + 1), z);
558 _mm256_storeu_si256((__m256i*)(o + 33), z);
559 return (ui32)(o - dst) + 1;
560 }
561
562 //************************************************************************/
578 static inline
579 ui32 destuff_vlc(const ui8* data, int lcup, int scup,
580 ui8* dst, ui32 cap)
581 {
582 const ui8* p = data + lcup - 2;
583 ui32 d = *p; // first byte, only the upper 4 bits are used
584 ui64 acc = d >> 4;
585 ui32 nb = 4 - ((acc & 7) == 7); // check standard
586 bool unstuff = (d | 0xF) > 0x8F;
587 return destuff_rev(p - 1, scup - 2, unstuff, acc, nb, dst, cap);
588 }
589
590 //************************************************************************/
605 static inline
606 ui32 destuff_mrp(const ui8* data, int lcup, int len2,
607 ui8* dst, ui32 cap)
608 {
609 return destuff_rev(data + lcup + len2 - 1, len2, true, 0, 0,
610 dst, cap);
611 }
612
613 //************************************************************************/
624 __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q,
625 const ui8* dbuf, ui32 limit, ui32& pos,
626 ui32 p, __m128i& vn) {
627 __m256i row = _mm256_setzero_si256();
628
629 // we keeps e_k, e_1, and rho in w2
630 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
631 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
632
633 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
634 {
635 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
636
637 // U_q holds U_q for this quad
638 // flags has e_k, e_1, and rho such that e_k is sitting in the
639 // 0x8000, e_1 in 0x800, and rho in 0x80
640
641 // next e_k and m_n
642 __m256i m_n;
643 __m256i w0 = _mm256_srli_epi32(flags, 15); // e_k
644 m_n = _mm256_sub_epi32(U_q, w0);
645 m_n = _mm256_andnot_si256(insig, m_n);
646
647 // find cumulative sums
648 // to find at which bit in ms_vec the sample starts
649 __m256i inc_sum = m_n; // inclusive scan
650 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
651 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
652 ui32 total_mn1 = (ui32)_mm256_extract_epi16(inc_sum, 6);
653 ui32 total_mn2 = (ui32)_mm256_extract_epi16(inc_sum, 14);
654
655 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
656 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
657 pos += total_mn1 + total_mn2;
658
659 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
660
661 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4); // exclusive scan
662
663 // find the starting byte and starting bit
664 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
665 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
666 byte_idx = _mm256_shuffle_epi8(byte_idx,
667 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
668 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
669 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
670 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
671 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
672
673 // shift samples values to correct location
674 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
675
676 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
677 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
678
679 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
680 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
681 d0 = _mm256_mullo_epi16(d0, bit_shift);
682 d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
683 d1 = _mm256_mullo_epi16(d1, bit_shift);
684 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((si32)0xFF00FF00)); // 8 in MSB
685 d0 = _mm256_or_si256(d0, d1);
686
687 // find location of e_k and mask
688 __m256i shift;
689 __m256i ones = _mm256_set1_epi32(1);
690 __m256i twos = _mm256_set1_epi32(2);
691 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
692 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
693 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
694 w0 = _mm256_sub_epi32(twos, w0);
695 shift = _mm256_sllv_epi32(w0, U_q_m1); // U_q_m1 must be no more than 31
696 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
697
698 // next e_1
699 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
700 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
701 w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position
702 ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
703 w0 = _mm256_slli_epi32(ms_vec, 31); // sign
704 ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
705 __m256i tvn = ms_vec;
706 ms_vec = _mm256_add_epi32(ms_vec, twos);// + 2
707 ms_vec = _mm256_slli_epi32(ms_vec, (si32)p - 1);
708 ms_vec = _mm256_or_si256(ms_vec, w0); // sign
709 row = _mm256_andnot_si256(insig, ms_vec); // significant only
710
711 ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
712
713 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
714
715 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
716 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
717 }
718 return row;
719 }
720
721
722 //************************************************************************/
732
734 __m256i decode_four_quad16(const __m128i inf_u_q, __m128i U_q,
735 const ui8* dbuf, ui32 limit, ui32& pos,
736 ui32 p, __m128i& vn) {
737
738 __m256i w0; // workers
739 __m256i insig; // lanes hold FF's if samples are insignificant
740 __m256i flags; // lanes hold e_k, e_1, and rho
741
742 __m256i row = _mm256_setzero_si256();
743 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
744 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
745 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
746 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
747 // we keeps e_k, e_1, and rho in w2
748 flags = _mm256_and_si256(w0,
749 _mm256_set_epi16((si16)0x8880, 0x4440, 0x2220, 0x1110,
750 (si16)0x8880, 0x4440, 0x2220, 0x1110,
751 (si16)0x8880, 0x4440, 0x2220, 0x1110,
752 (si16)0x8880, 0x4440, 0x2220, 0x1110));
753 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
754 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF) //are all insignificant?
755 {
756 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
757 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
758 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
759 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
760
761 // U_q holds U_q for this quad
762 // flags has e_k, e_1, and rho such that e_k is sitting in the
763 // 0x8000, e_1 in 0x800, and rho in 0x80
764
765 // next e_k and m_n
766 __m256i m_n;
767 w0 = _mm256_srli_epi16(flags, 15); // e_k
768 m_n = _mm256_sub_epi16(U_q_avx, w0);
769 m_n = _mm256_andnot_si256(insig, m_n);
770
771 // find cumulative sums
772 // to find at which bit in ms_vec the sample starts
773 __m256i inc_sum = m_n; // inclusive scan
774 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
775 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
776 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
777 ui32 total_mn1 = (ui32)_mm256_extract_epi16(inc_sum, 7);
778 ui32 total_mn2 = (ui32)_mm256_extract_epi16(inc_sum, 15);
779 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2); // exclusive scan
780
781 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
782 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
783 pos += total_mn1 + total_mn2;
784
785 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
786
787 // find the starting byte and starting bit
788 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
789 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
790 byte_idx = _mm256_shuffle_epi8(byte_idx,
791 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
792 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
793 0x0606, 0x0404, 0x0202, 0x0000));
794 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
795 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
796 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
797 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
798
799 // shift samples values to correct location
800 __m256i bit_shift = _mm256_shuffle_epi8(
801 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
802 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
803 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
804 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
805 d0 = _mm256_mullo_epi16(d0, bit_shift);
806 d0 = _mm256_srli_epi16(d0, 8); // we should have 8 bits in the LSB
807 d1 = _mm256_mullo_epi16(d1, bit_shift);
808 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((si16)0xFF00)); // 8 in MSB
809 d0 = _mm256_or_si256(d0, d1);
810
811 // find location of e_k and mask
812 __m256i shift;
813 __m256i ones = _mm256_set1_epi16(1);
814 __m256i twos = _mm256_set1_epi16(2);
815 // shift = (2 - e_k) << (U_q - 1); AVX2 has no _mm256_sllv_epi16,
816 // so the variable shift is emulated with a pshufb power-of-two
817 // lookup and a 16-bit multiply. U_q - 1 <= 14 in this path;
818 // for U_q == 0 the lookup indices have their MSB set, so pshufb
819 // returns 0, the same shift value the former uniform 16-bit
820 // shift emulation produced (it shifted by 31)
821 __m256i kq = _mm256_sub_epi16(U_q_avx, ones);
822 __m256i idx = _mm256_or_si256(kq,
823 _mm256_slli_epi16(_mm256_sub_epi16(kq,
824 _mm256_set1_epi16(8)), 8));
825 const __m256i pow2_tbl = _mm256_setr_epi8(
826 1, 2, 4, 8, 16, 32, 64, (char)128, 0, 0, 0, 0, 0, 0, 0, 0,
827 1, 2, 4, 8, 16, 32, 64, (char)128, 0, 0, 0, 0, 0, 0, 0, 0);
828 __m256i pow2 = _mm256_shuffle_epi8(pow2_tbl, idx);
829 w0 = _mm256_sub_epi16(twos, w0);
830 shift = _mm256_mullo_epi16(w0, pow2);
831 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
832
833 // next e_1
834 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
835 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
836 w0 = _mm256_andnot_si256(w0, shift); // e_1 in correct position
837 ms_vec = _mm256_or_si256(ms_vec, w0); // e_1
838 w0 = _mm256_slli_epi16(ms_vec, 15); // sign
839 ms_vec = _mm256_or_si256(ms_vec, ones); // bin center
840 __m256i tvn = ms_vec;
841 ms_vec = _mm256_add_epi16(ms_vec, twos);// + 2
842 ms_vec = _mm256_slli_epi16(ms_vec, (si32)p - 1);
843 ms_vec = _mm256_or_si256(ms_vec, w0); // sign
844 row = _mm256_andnot_si256(insig, ms_vec); // significant only
845
846 ms_vec = _mm256_andnot_si256(insig, tvn); // significant only
847
848 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
849 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
850 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
851 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
852 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
853 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
854 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
855
856 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
857 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
858 }
859 return row;
860 }
861
862 // https://stackoverflow.com/a/58827596
863 inline __m256i avx2_lzcnt_epi32(__m256i v) {
864 // prevent value from being rounded up to the next power of two
865 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v); // keep 8 MSB
866
867 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v)); // convert an integer to float
868 v = _mm256_srli_epi32(v, 23); // shift down the exponent
869 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v); // undo bias
870 v = _mm256_min_epi16(v, _mm256_set1_epi32(32)); // clamp at 32
871
872 return v;
873 }
874
875 //************************************************************************/
892 //************************************************************************/
902 bool decode_cb_step2_16bit(ui16* scratch, ui32* decoded_data,
903 ui8* coded_data, ui32 width, ui32 height,
904 ui32 stride, ui32 sstr, ui32 p, ui32 mmsbp2,
905 int lcup, int scup)
906 {
907 // reduce bitplane by 16 because we now have 16 bits instead of 32
908 p -= 16;
909
910 const int v_n_size = 512 + 16;
911#ifdef __MINGW64__
912 ui16 v_n_scratch[v_n_size] = {0};
913 ui32 v_n_scratch_32[v_n_size] = {0};
914#else
915 ui16 v_n_scratch[v_n_size];
916 memset(v_n_scratch + (width >> 1) + 4, 0, 8 * sizeof(ui16));
917 ui32 v_n_scratch_32[v_n_size];
918#endif
919
920 // maximum consumable MagSgn bits: 4096 samples x (mmsbp2 < 16) bits
921 const ui32 dbuf_cap = 4096 * 15 / 8;
922 ui8 dbuf[dbuf_cap + 72];
923 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
924 ui32 pos = 0;
925
926 {
927 ui16 *sp = scratch;
928 ui16 *vp = v_n_scratch;
929 ui32 *dp = decoded_data;
930 vp[0] = 2; // for easy calculation of emax
931
932 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
934 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
935 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
936 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((int)mmsbp2));
937 if (!_mm_testz_si128(w, w)) {
938 return false;
939 }
940
941 __m128i vn = _mm_set1_epi16(2);
942 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
943
944 w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
945 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
946
947 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
948 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
949
950 _mm256_storeu_si256((__m256i*)dp, w0);
951 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
952 }
953 }
954
955 for (ui32 y = 2; y < height; y += 2) {
956 {
957 // perform 15 - count_leading_zeros(*vp) here
958 ui16 *vp = v_n_scratch;
959 ui32 *vp_32 = v_n_scratch_32;
960
961 ui16* sp = scratch + (y >> 1) * sstr;
962 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
963 const __m256i avx_31 = _mm256_set1_epi32(31);
964 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
965 const __m256i avx_1 = _mm256_set1_epi32(1);
966 const __m256i avx_0 = _mm256_setzero_si256();
967
968 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
969 __m128i v = _mm_loadu_si128((__m128i*)vp);
970 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
971 v = _mm_or_si128(v, v_p1);
972
973 __m256i v_avx = _mm256_cvtepu16_epi32(v);
974 v_avx = avx2_lzcnt_epi32(v_avx);
975 v_avx = _mm256_sub_epi32(avx_31, v_avx);
976
977 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
978 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
979 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
980 gamma = _mm256_and_si256(gamma, w0);
981 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
982
983 v_avx = _mm256_andnot_si256(gamma, v_avx);
984 v_avx = _mm256_max_epi32(v_avx, avx_1);
985
986 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
987 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
988
989 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
990 if (!_mm256_testz_si256(w0, w0)) {
991 return false;
992 }
993
994 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
995 }
996 }
997
998 ui16 *vp = v_n_scratch;
999 ui32* vp_32 = v_n_scratch_32;
1000 ui16 *sp = scratch + (y >> 1) * sstr;
1001 ui32 *dp = decoded_data + y * stride;
1002 vp[0] = 2; // for easy calculation of emax
1003
1004 for (ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1006 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1007 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1008
1009 __m128i vn = _mm_set1_epi16(2);
1010 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1011
1012 __m128i w = _mm_cvtsi32_si128(*(unsigned short const*)(vp));
1013 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1014
1015 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1016 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1017
1018 _mm256_storeu_si256((__m256i*)dp, w0);
1019 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1020 }
1021 }
1022 return true;
1023 }
1024
1025 //************************************************************************/
1034 bool decode_cb_step2_32bit(ui16* scratch, ui32* decoded_data,
1035 ui8* coded_data, ui32 width, ui32 height,
1036 ui32 stride, ui32 sstr, ui32 p, ui32 mmsbp2,
1037 int lcup, int scup)
1038 {
1039 const int v_n_size = 512 + 16;
1040#ifdef __MINGW64__
1041 ui32 v_n_scratch[2 * v_n_size] = {0};
1042#else
1043 ui32 v_n_scratch[2 * v_n_size];
1044 memset(v_n_scratch + (width >> 1) + 2, 0, 14 * sizeof(ui32));
1045#endif
1046
1047 // maximum consumable MagSgn bits: 4096 samples x (mmsbp2 <= 32) bits
1048 const ui32 dbuf_cap = 4096 * 32 / 8;
1049 ui8 dbuf[dbuf_cap + 72];
1050 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
1051 ui32 pos = 0;
1052
1053 const __m256i avx_mmsbp2 = _mm256_set1_epi32((int)mmsbp2);
1054
1055 {
1056 ui16 *sp = scratch;
1057 ui32 *vp = v_n_scratch;
1058 ui32 *dp = decoded_data;
1059 vp[0] = 2; // for easy calculation of emax
1060
1061 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1062 {
1063 __m128i vn = _mm_set1_epi32(2);
1064
1065 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1066 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1067
1068 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1069 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1070 if (!_mm256_testz_si256(w, w)) {
1071 return false;
1072 }
1073
1074 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1075 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1076 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1077 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1078
1079 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1080 w0 = _mm_or_si128(w0, vn);
1081 _mm_storeu_si128((__m128i*)vp, w0);
1082 }
1083 }
1084
1085 for (ui32 y = 2; y < height; y += 2)
1086 {
1087 {
1088 // perform 31 - count_leading_zeros(*vp) here
1089 ui32 *vp = v_n_scratch;
1090 ui16* sp = scratch + (y >> 1) * sstr;
1091
1092 const __m256i avx_31 = _mm256_set1_epi32(31);
1093 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1094 const __m256i avx_1 = _mm256_set1_epi32(1);
1095 const __m256i avx_0 = _mm256_setzero_si256();
1096
1097 for (ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1098 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1099 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1100 v = _mm256_or_si256(v, v_p1);
1101 v = avx2_lzcnt_epi32(v);
1102 v = _mm256_sub_epi32(avx_31, v);
1103
1104 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1105 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1106 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1107 gamma = _mm256_and_si256(gamma, w0);
1108 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1109
1110 v = _mm256_andnot_si256(gamma, v);
1111 v = _mm256_max_epi32(v, avx_1);
1112
1113 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1114 v = _mm256_add_epi32(inf_u_q, v);
1115
1116 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1117 if (!_mm256_testz_si256(w0, w0)) {
1118 return false;
1119 }
1120
1121 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1122 }
1123 }
1124
1125 ui32 *vp = v_n_scratch;
1126 ui16 *sp = scratch + (y >> 1) * sstr;
1127 ui32 *dp = decoded_data + y * stride;
1128 vp[0] = 2; // for easy calculation of emax
1129
1130 for (ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1131 //process two quads
1132 __m128i vn = _mm_set1_epi32(2);
1133
1134 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1135 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1136
1137 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1138 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1139
1140 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1141 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1142 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1143 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1144
1145 __m128i w0 = _mm_cvtsi32_si128(*(int const*)vp);
1146 w0 = _mm_or_si128(w0, vn);
1147 _mm_storeu_si128((__m128i*)vp, w0);
1148 }
1149 }
1150 return true;
1151 }
1152
1153 //************************************************************************/
1161 void decode_cb_spp_mrp(ui16* scratch, ui32* decoded_data, ui8* coded_data,
1162 ui32 width, ui32 height, ui32 stride, ui32 sstr,
1163 ui32 p, ui32 num_passes, ui32 lengths1,
1164 ui32 lengths2, bool stripe_causal)
1165 {
1166 // We use scratch again, we can divide it into multiple regions
1167 // sigma holds all the significant samples, and it cannot
1168 // be modified after it is set. it will be used during the
1169 // Magnitude Refinement Pass
1170 ui16* const sigma = scratch;
1171
1172 ui32 mstr = (width + 3u) >> 2; // divide by 4, since each
1173 // ui16 contains 4 columns
1174 mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
1175
1176 // We re-arrange quad significance, where each 4 consecutive
1177 // bits represent one quad, into column significance, where,
1178 // each 4 consequtive bits represent one column of 4 rows
1179 {
1180 ui32 y;
1181
1182 const __m128i mask_3 = _mm_set1_epi32(0x30);
1183 const __m128i mask_C = _mm_set1_epi32(0xC0);
1184 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1185 for (y = 0; y < height; y += 4)
1186 {
1187 ui16* sp = scratch + (y >> 1) * sstr;
1188 ui16* dp = sigma + (y >> 2) * mstr;
1189 for (ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1190 {
1191 __m128i s0, s1, u3, uC, t0, t1;
1192
1193 s0 = _mm_loadu_si128((__m128i*)(sp));
1194 u3 = _mm_and_si128(s0, mask_3);
1195 u3 = _mm_srli_epi32(u3, 4);
1196 uC = _mm_and_si128(s0, mask_C);
1197 uC = _mm_srli_epi32(uC, 2);
1198 t0 = _mm_or_si128(u3, uC);
1199
1200 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1201 u3 = _mm_and_si128(s1, mask_3);
1202 u3 = _mm_srli_epi32(u3, 2);
1203 uC = _mm_and_si128(s1, mask_C);
1204 t1 = _mm_or_si128(u3, uC);
1205
1206 __m128i r = _mm_or_si128(t0, t1);
1207 r = _mm_shuffle_epi8(r, shuffle_mask);
1208
1209 ui32 t = (ui32)_mm_extract_epi32(r, 0);
1210 memcpy(dp, &t, 4);
1211 }
1212 dp[0] = 0; // set an extra entry on the right with 0
1213 }
1214 {
1215 // reset one row after the codeblock
1216 ui16* dp = sigma + (y >> 2) * mstr;
1217 __m128i zero = _mm_setzero_si128();
1218 for (ui32 x = 0; x < width; x += 32, dp += 8)
1219 _mm_storeu_si128((__m128i*)dp, zero);
1220 dp[0] = 0; // set an extra entry on the right with 0
1221 }
1222 }
1223
1224 // We perform Significance Propagation Pass here
1225 {
1226 // This stores significance information of the previous
1227 // 4 rows. Significance information in this array includes
1228 // all signicant samples in bitplane p - 1; that is,
1229 // significant samples for bitplane p (discovered during the
1230 // cleanup pass and stored in sigma) and samples that have recently
1231 // became significant (during the SPP) in bitplane p-1.
1232 // We store enough for the widest row, containing 1024 columns,
1233 // which is equivalent to 256 of ui16, since each stores 4 columns.
1234 // We add an extra 8 entries, just in case we need more
1235 ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
1236
1237 // maximum consumable SPP bits: 4096 samples x 2 bits (one
1238 // significance bit and one sign bit per sample)
1239 const ui32 spp_cap = 4096 * 2 / 8;
1240 ui8 spp_buf[spp_cap + 72];
1241 ui32 spp_limit = destuff_frwd<0>(coded_data + lengths1,
1242 (int)lengths2, spp_buf, spp_cap);
1243 ui32 spp_pos = 0;
1244
1245 for (ui32 y = 0; y < height; y += 4)
1246 {
1247 ui32 pattern = 0xFFFFu; // a pattern needed samples
1248 if (height - y < 4) {
1249 pattern = 0x7777u;
1250 if (height - y < 3) {
1251 pattern = 0x3333u;
1252 if (height - y < 2)
1253 pattern = 0x1111u;
1254 }
1255 }
1256
1257 // prev holds sign. info. for the previous quad, together
1258 // with the rows on top of it and below it.
1259 ui32 prev = 0;
1260 ui16 *prev_sig = prev_row_sig;
1261 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1262 ui32 *dpp = decoded_data + y * stride;
1263 for (ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1264 {
1265 // only rows and columns inside the stripe are included
1266 si32 s = (si32)x + 4 - (si32)width;
1267 s = ojph_max(s, 0);
1268 pattern = pattern >> (s * 4);
1269
1270 // We first find locations that need to be tested (potential
1271 // SPP members); these location will end up in mbr
1272 // In each iteration, we produce 16 bits because cwd can have
1273 // up to 16 bits of significance information, followed by the
1274 // corresponding 16 bits of sign information; therefore, it is
1275 // sufficient to fetch 32 bit data per loop.
1276
1277 // Althougth we are interested in 16 bits only, we load 32 bits.
1278 // For the 16 bits we are producing, we need the next 4 bits --
1279 // We need data for at least 5 columns out of 8.
1280 // Therefore loading 32 bits is easier than loading 16 bits
1281 // twice.
1282 ui32 ps, ns, cs;
1283 memcpy(&ps, prev_sig, 4);
1284 memcpy(&ns, cur_sig + mstr, 4);
1285 ui32 u = (ps & 0x88888888) >> 3; // the row on top
1286 if (!stripe_causal)
1287 u |= (ns & 0x11111111) << 3; // the row below
1288
1289 memcpy(&cs, cur_sig, 4);
1290 // vertical integration
1291 ui32 mbr = cs; // this sig. info.
1292 mbr |= (cs & 0x77777777) << 1; //above neighbors
1293 mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors
1294 mbr |= u;
1295 // horizontal integration
1296 ui32 t = mbr;
1297 mbr |= t << 4; // neighbors on the left
1298 mbr |= t >> 4; // neighbors on the right
1299 mbr |= prev >> 12; // significance of previous group
1300
1301 // remove outside samples, and already significant samples
1302 mbr &= pattern;
1303 mbr &= ~cs;
1304
1305 // find samples that become significant during the SPP
1306 ui32 new_sig = mbr;
1307 if (new_sig)
1308 {
1309 ui64 cwd = dfetch64(spp_buf, spp_limit, spp_pos);
1310
1311 ui32 cnt = 0;
1312 ui32 col_mask = 0xFu;
1313 ui32 inv_sig = ~cs & pattern;
1314 for (int i = 0; i < 16; i += 4, col_mask <<= 4)
1315 {
1316 if ((col_mask & new_sig) == 0)
1317 continue;
1318
1319 //scan one column
1320 ui32 sample_mask = 0x1111u & col_mask;
1321 if (new_sig & sample_mask)
1322 {
1323 new_sig &= ~sample_mask;
1324 if (cwd & 1)
1325 {
1326 ui32 t = 0x33u << i;
1327 new_sig |= t & inv_sig;
1328 }
1329 cwd >>= 1; ++cnt;
1330 }
1331
1332 sample_mask <<= 1;
1333 if (new_sig & sample_mask)
1334 {
1335 new_sig &= ~sample_mask;
1336 if (cwd & 1)
1337 {
1338 ui32 t = 0x76u << i;
1339 new_sig |= t & inv_sig;
1340 }
1341 cwd >>= 1; ++cnt;
1342 }
1343
1344 sample_mask <<= 1;
1345 if (new_sig & sample_mask)
1346 {
1347 new_sig &= ~sample_mask;
1348 if (cwd & 1)
1349 {
1350 ui32 t = 0xECu << i;
1351 new_sig |= t & inv_sig;
1352 }
1353 cwd >>= 1; ++cnt;
1354 }
1355
1356 sample_mask <<= 1;
1357 if (new_sig & sample_mask)
1358 {
1359 new_sig &= ~sample_mask;
1360 if (cwd & 1)
1361 {
1362 ui32 t = 0xC8u << i;
1363 new_sig |= t & inv_sig;
1364 }
1365 cwd >>= 1; ++cnt;
1366 }
1367 }
1368
1369 if (new_sig)
1370 {
1371 // the sign bits sit right after the cnt consumed bits
1372 // of cwd; no reassembly is needed
1373
1374 // Spread new_sig, such that each bit is in one byte with a
1375 // value of 0 if new_sig bit is 0, and 0xFF if new_sig is 1
1376 __m128i new_sig_vec = _mm_set1_epi16((si16)new_sig);
1377 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1378 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1379 new_sig_vec = _mm_and_si128(new_sig_vec,
1380 _mm_set1_epi64x((si64)0x8040201008040201));
1381 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1382 _mm_set1_epi64x((si64)0x8040201008040201));
1383
1384 // find cumulative sums
1385 // to find which bit in cwd we should extract
1386 __m128i inc_sum = new_sig_vec; // inclusive scan
1387 inc_sum = _mm_abs_epi8(inc_sum); // cvrt to 0 or 1
1388 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1389 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1390 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1391 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1392 cnt += (ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1393 // exclusive scan
1394 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1395
1396 // Spread cwd, such that each bit is in one byte
1397 // with a value of 0 or 1.
1398 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1399 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1400 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1401 cwd_vec = _mm_and_si128(cwd_vec,
1402 _mm_set1_epi64x((si64)0x8040201008040201));
1403 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1404 _mm_set1_epi64x((si64)0x8040201008040201));
1405 cwd_vec = _mm_abs_epi8(cwd_vec);
1406
1407 // Obtain bit from cwd_vec correspondig to ex_sum
1408 // Basically, collect needed bits from cwd_vec
1409 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1410
1411 // load data and set spp coefficients
1412 __m128i m =
1413 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1414 __m128i val = _mm_set1_epi32(3 << (p - 2));
1415 ui32 *dp = dpp;
1416 for (int c = 0; c < 4; ++ c) {
1417 __m128i s0, s0_ns, s0_val;
1418 // load coefficients
1419 s0 = _mm_load_si128((__m128i*)dp);
1420
1421 // epi32 is -1 only for coefficient that
1422 // are changed during the SPP
1423 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1424 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1425
1426 // obtain sign for coefficients in SPP
1427 s0_val = _mm_shuffle_epi8(v, m);
1428 s0_val = _mm_slli_epi32(s0_val, 31);
1429 s0_val = _mm_or_si128(s0_val, val);
1430 s0_val = _mm_and_si128(s0_val, s0_ns);
1431
1432 // update vector
1433 s0 = _mm_or_si128(s0, s0_val);
1434 // store coefficients
1435 _mm_store_si128((__m128i*)dp, s0);
1436 // prepare for next row
1437 dp += stride;
1438 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1439 }
1440 }
1441 spp_pos += cnt;
1442 }
1443
1444 new_sig |= cs;
1445 *prev_sig = (ui16)(new_sig);
1446
1447 // vertical integration for the new sig. info.
1448 t = new_sig;
1449 new_sig |= (t & 0x7777) << 1; //above neighbors
1450 new_sig |= (t & 0xEEEE) >> 1; //below neighbors
1451 // add sig. info. from the row on top and below
1452 prev = new_sig | u;
1453 // we need only the bits in 0xF000
1454 prev &= 0xF000;
1455 }
1456 }
1457 }
1458
1459 // We perform Magnitude Refinement Pass here
1460 if (num_passes > 2)
1461 {
1462 // de-stuff the MRP segment; consumption is at most 1 bit per
1463 // sample = 512 bytes, so 1024 bytes always suffice
1464 const ui32 mrp_cap = 1024;
1465 ui8 mrp_buf[mrp_cap + 72];
1466 ui32 mrp_limit = destuff_mrp(coded_data, (int)lengths1,
1467 (int)lengths2, mrp_buf, mrp_cap);
1468 ui32 mrp_pos = 0;
1469
1470 for (ui32 y = 0; y < height; y += 4)
1471 {
1472 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1473 ui32 *dpp = decoded_data + y * stride;
1474 for (ui32 i = 0; i < width; i += 4, dpp += 4)
1475 {
1476 //Process one entry from sigma array at a time
1477 // Each nibble (4 bits) in the sigma array represents 4 rows,
1478 ui64 cwd = dfetch64(mrp_buf, mrp_limit, mrp_pos);
1479 ui16 sig = *cur_sig++; // 16 bit that will be processed now
1480 int total_bits = 0;
1481 if (sig) // if any of the 32 bits are set
1482 {
1483 // We work on 4 rows, with 4 samples each, since
1484 // data is 32 bit (4 bytes)
1485
1486 // spread the 16 bits in sig to 0 or 1 bytes in sig_vec
1487 __m128i sig_vec = _mm_set1_epi16((si16)sig);
1488 sig_vec = _mm_shuffle_epi8(sig_vec,
1489 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1490 sig_vec = _mm_and_si128(sig_vec,
1491 _mm_set1_epi64x((si64)0x8040201008040201));
1492 sig_vec = _mm_cmpeq_epi8(sig_vec,
1493 _mm_set1_epi64x((si64)0x8040201008040201));
1494 sig_vec = _mm_abs_epi8(sig_vec);
1495
1496 // find cumulative sums
1497 // to find which bit in cwd we should extract
1498 __m128i inc_sum = sig_vec; // inclusive scan
1499 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1500 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1501 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1502 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1503 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1504 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1); // exclusive scan
1505
1506 // Spread the 16 bits in cwd to inverted 0 or 1 bytes in
1507 // cwd_vec. Then, convert these to a form suitable
1508 // for coefficient modifications; in particular, a value
1509 // of 0 is presented as binary 11, and a value of 1 is
1510 // represented as binary 01
1511 __m128i cwd_vec = _mm_set1_epi16((si16)cwd);
1512 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1513 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1514 cwd_vec = _mm_and_si128(cwd_vec,
1515 _mm_set1_epi64x((si64)0x8040201008040201));
1516 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1517 _mm_set1_epi64x((si64)0x8040201008040201));
1518 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
1519 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
1520 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
1521
1522 // load data and insert the mrp bit
1523 __m128i m =
1524 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1525 ui32 *dp = dpp;
1526 for (int c = 0; c < 4; ++c) {
1527 __m128i s0, s0_sig, s0_idx, s0_val;
1528 // load coefficients
1529 s0 = _mm_load_si128((__m128i*)dp);
1530 // find significant samples in this row
1531 s0_sig = _mm_shuffle_epi8(sig_vec, m);
1532 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
1533 // get MRP bit index, and MRP pattern
1534 s0_idx = _mm_shuffle_epi8(ex_sum, m);
1535 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
1536 // keep data from significant samples only
1537 s0_val = _mm_andnot_si128(s0_sig, s0_val);
1538 // move mrp bits to correct position, and employ
1539 s0_val = _mm_slli_epi32(s0_val, (si32)p - 2);
1540 s0 = _mm_xor_si128(s0, s0_val);
1541 // store coefficients
1542 _mm_store_si128((__m128i*)dp, s0);
1543 // prepare for next row
1544 dp += stride;
1545 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1546 }
1547 }
1548 // consume data according to the number of bits set
1549 mrp_pos += (ui32)total_bits;
1550 }
1551 }
1552 }
1553 }
1554
1555 //************************************************************************/
1562 void decode_cb_step1_vlc(ui16* scratch, ui8* coded_data, int lcup,
1563 int scup, ui32 width, ui32 height, ui32 sstr)
1564 {
1565 // init structures
1566 dec_mel_st mel;
1567 mel_init(&mel, coded_data, lcup, scup);
1568
1569 // de-stuff the VLC segment; its size is at most scup - 1 < 4095
1570 // bytes (scup is a 12-bit value), and consumption per quad pair is
1571 // at most 7 + 7 + 30 bits, so 4096 bytes always suffice
1572 const ui32 vlc_cap = 4096;
1573 ui8 vlc_buf[vlc_cap + 72];
1574 ui32 vlc_limit = destuff_vlc(coded_data, lcup, scup,
1575 vlc_buf, vlc_cap);
1576 ui32 vlc_off = 0;
1577 ui32 vlc_bits = 0;
1578
1579 int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm
1580 // data represented as runs of 0 events
1581 // See mel_decode description
1582
1583 ui64 vlc_val = 0;
1584 ui32 c_q = 0;
1585 ui16 *sp = scratch;
1586 //initial quad row
1587 for (ui32 x = 0; x < width; sp += 4)
1588 {
1589 // decode VLC
1591
1592 // first quad
1593 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1594
1595 //decode VLC using the context c_q and the head of VLC bitstream
1596 ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ];
1597
1598 // if context is zero, use one MEL event
1599 if (c_q == 0) //zero context
1600 {
1601 run -= 2; //subtract 2, since events number if multiplied by 2
1602
1603 // Is the run terminated in 1? if so, use decoded VLC code,
1604 // otherwise, discard decoded data, since we will decoded again
1605 // using a different context
1606 t0 = (run == -1) ? t0 : 0;
1607
1608 // is run -1 or -2? this means a run has been consumed
1609 if (run < 0)
1610 run = mel_get_run(&mel); // get another run
1611 }
1612 //run -= (c_q == 0) ? 2 : 0;
1613 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1614 //if (run < 0)
1615 // run = mel_get_run(&mel); // get another run
1616 sp[0] = t0;
1617 x += 2;
1618
1619 // prepare context for the next quad; eqn. 1 in ITU T.814
1620 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1621
1622 //remove data from vlc stream (0 bits are removed if vlc is not used)
1623 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1624
1625 //second quad
1626 ui16 t1 = 0;
1627
1628 //decode VLC using the context c_q and the head of VLC bitstream
1629 t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)];
1630
1631 // if context is zero, use one MEL event
1632 if (c_q == 0 && x < width) //zero context
1633 {
1634 run -= 2; //subtract 2, since events number if multiplied by 2
1635
1636 // if event is 0, discard decoded t1
1637 t1 = (run == -1) ? t1 : 0;
1638
1639 if (run < 0) // have we consumed all events in a run
1640 run = mel_get_run(&mel); // if yes, then get another run
1641 }
1642 t1 = x < width ? t1 : 0;
1643 //run -= (c_q == 0 && x < width) ? 2 : 0;
1644 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1645 //if (run < 0)
1646 // run = mel_get_run(&mel); // get another run
1647 sp[2] = t1;
1648 x += 2;
1649
1650 //prepare context for the next quad, eqn. 1 in ITU T.814
1651 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1652
1653 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1654 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1655
1656 // decode u
1658 // uvlc_mode is made up of u_offset bits from the quad pair
1659 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1660 if (uvlc_mode == 0xc0)// if both u_offset are set, get an event from
1661 { // the MEL run of events
1662 run -= 2; //subtract 2, since events number if multiplied by 2
1663
1664 uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode by
1665 // is 0x40
1666
1667 if (run < 0)//if run is consumed (run is -1 or -2), get another run
1668 run = mel_get_run(&mel);
1669 }
1670 //run -= (uvlc_mode == 0xc0) ? 2 : 0;
1671 //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
1672 //if (run < 0)
1673 // run = mel_get_run(&mel); // get another run
1674
1675 //decode uvlc_mode to get u for both quads
1676 ui32 uvlc_entry = uvlc_tbl0[uvlc_mode + (vlc_val & 0x3F)];
1677 //remove total prefix length
1678 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1679 uvlc_entry >>= 3;
1680 //extract suffixes for quad 0 and 1
1681 ui32 len = uvlc_entry & 0xF; //suffix length for 2 quads
1682 ui32 tmp = (ui32)vlc_val & ((1u << len) - 1); //suffix value, 2 quads
1683 dconsume(vlc_val, vlc_bits, len);
1684 uvlc_entry >>= 4;
1685 // quad 0 length
1686 len = uvlc_entry & 0x7; // quad 0 suffix length
1687 uvlc_entry >>= 3;
1688 ui16 u_q = (ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len))); //kap. 1
1689 sp[1] = u_q;
1690 u_q = (ui16)(1 + (uvlc_entry >> 3) + (tmp >> len)); //kappa == 1
1691 sp[3] = u_q;
1692 }
1693 sp[0] = sp[1] = 0;
1694
1695 //non initial quad rows
1696 for (ui32 y = 2; y < height; y += 2)
1697 {
1698 c_q = 0; // context
1699 ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
1700
1701 for (ui32 x = 0; x < width; sp += 4)
1702 {
1703 // decode VLC
1705
1706 // sigma_q (n, ne, nf)
1707 c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
1708 c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
1709
1710 // first quad
1711 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1712
1713 //decode VLC using the context c_q and the head of VLC bitstream
1714 ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
1715
1716 // if context is zero, use one MEL event
1717 if (c_q == 0) //zero context
1718 {
1719 run -= 2; //subtract 2, since events number is multiplied by 2
1720
1721 // Is the run terminated in 1? if so, use decoded VLC code,
1722 // otherwise, discard decoded data, since we will decoded again
1723 // using a different context
1724 t0 = (run == -1) ? t0 : 0;
1725
1726 // is run -1 or -2? this means a run has been consumed
1727 if (run < 0)
1728 run = mel_get_run(&mel); // get another run
1729 }
1730 //run -= (c_q == 0) ? 2 : 0;
1731 //t0 = (c_q != 0 || run == -1) ? t0 : 0;
1732 //if (run < 0)
1733 // run = mel_get_run(&mel); // get another run
1734 sp[0] = t0;
1735 x += 2;
1736
1737 // prepare context for the next quad; eqn. 2 in ITU T.814
1738 // sigma_q (w, sw)
1739 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1740 // sigma_q (nw)
1741 c_q |= sp[0 - (si32)sstr] & 0x80;
1742 // sigma_q (n, ne, nf)
1743 c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
1744 c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
1745
1746 //remove data from vlc stream (0 bits are removed if vlc is unused)
1747 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1748
1749 //second quad
1750 ui16 t1 = 0;
1751
1752 //decode VLC using the context c_q and the head of VLC bitstream
1753 t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1754
1755 // if context is zero, use one MEL event
1756 if (c_q == 0 && x < width) //zero context
1757 {
1758 run -= 2; //subtract 2, since events number if multiplied by 2
1759
1760 // if event is 0, discard decoded t1
1761 t1 = (run == -1) ? t1 : 0;
1762
1763 if (run < 0) // have we consumed all events in a run
1764 run = mel_get_run(&mel); // if yes, then get another run
1765 }
1766 t1 = x < width ? t1 : 0;
1767 //run -= (c_q == 0 && x < width) ? 2 : 0;
1768 //t1 = (c_q != 0 || run == -1) ? t1 : 0;
1769 //if (run < 0)
1770 // run = mel_get_run(&mel); // get another run
1771 sp[2] = t1;
1772 x += 2;
1773
1774 // partial c_q, will be completed when we process the next quad
1775 // sigma_q (w, sw)
1776 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1777 // sigma_q (nw)
1778 c_q |= sp[2 - (si32)sstr] & 0x80;
1779
1780 //remove data from vlc stream, if qinf is not used, cwdlen is 0
1781 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1782
1783 // decode u using wide UVLC table
1785 ui32 uvlc_mode = (((t0 >> 3) & 1) | (((t1 >> 3) & 1) << 1));
1786 ui32 uvlc_entry =
1787 uvlc_tbl1_wide[(uvlc_mode << 10) | (vlc_val & 0x3FF)];
1788 ui32 total_bits = uvlc_entry & 0x1F;
1789 if (total_bits < 0x1F) {
1790 sp[1] = (ui16)((uvlc_entry >> 5) & 0xFF);
1791 sp[3] = (ui16)((uvlc_entry >> 13) & 0xFF);
1792 dconsume(vlc_val, vlc_bits, total_bits);
1793 } else {
1794 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1795 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1796 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1797 uvlc_entry >>= 3;
1798 ui32 len = uvlc_entry & 0xF;
1799 ui32 tmp = (ui32)vlc_val & ((1u << len) - 1);
1800 dconsume(vlc_val, vlc_bits, len);
1801 uvlc_entry >>= 4;
1802 len = uvlc_entry & 0x7;
1803 uvlc_entry >>= 3;
1804 sp[1] = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1805 sp[3] = (ui16)((uvlc_entry >> 3) + (tmp >> len));
1806 }
1807 }
1808 sp[0] = sp[1] = 0;
1809 }
1810 }
1811
1812 bool ojph_decode_codeblock_avx2(ui8* coded_data, ui32* decoded_data,
1813 ui32 missing_msbs, ui32 num_passes,
1814 ui32 lengths1, ui32 lengths2,
1815 ui32 width, ui32 height, ui32 stride,
1816 bool stripe_causal)
1817 {
1818 static bool insufficient_precision = false;
1819 static bool modify_code = false;
1820 static bool truncate_spp_mrp = false;
1821
1822 if (num_passes > 1 && lengths2 == 0)
1823 {
1824 OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
1825 "one coding pass, but zero length for "
1826 "2nd and potential 3rd pass.");
1827 num_passes = 1;
1828 }
1829
1830 if (num_passes > 3)
1831 {
1832 OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
1833 "This codeblocks has %d passes.",
1834 num_passes);
1835 return false;
1836 }
1837
1838 if (missing_msbs > 30) // p < 0
1839 {
1840 if (insufficient_precision == false)
1841 {
1842 insufficient_precision = true;
1843 OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
1844 "codeblock. This message will not be "
1845 "displayed again.");
1846 }
1847 return false;
1848 }
1849 else if (missing_msbs == 30) // p == 0
1850 { // not enough precision to decode and set the bin center to 1
1851 if (modify_code == false) {
1852 modify_code = true;
1853 OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
1854 "pass. The code can be modified to support "
1855 "this case. This message will not be "
1856 "displayed again.");
1857 }
1858 return false; // 32 bits are not enough to decode this
1859 }
1860 else if (missing_msbs == 29) // if p is 1, then num_passes must be 1
1861 {
1862 if (num_passes > 1) {
1863 num_passes = 1;
1864 if (truncate_spp_mrp == false) {
1865 truncate_spp_mrp = true;
1866 OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
1867 "nor MagRef passes; both will be skipped. "
1868 "This message will not be displayed "
1869 "again.");
1870 }
1871 }
1872 }
1873 ui32 p = 30 - missing_msbs; // The least significant bitplane for CUP
1874 // There is a way to handle the case of p == 0, but a different path
1875 // is required
1876
1877 if (lengths1 < 2)
1878 {
1879 OJPH_WARN(0x00010006, "Wrong codeblock length.");
1880 return false;
1881 }
1882
1883 // read scup and fix the bytes there
1884 int lcup, scup;
1885 lcup = (int)lengths1; // length of CUP
1886 //scup is the length of MEL + VLC
1887 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1888 if (scup < 2 || scup > lcup || scup > 4079) //something is wrong
1889 return false;
1890
1891 // The temporary storage scratch holds two types of data in an
1892 // interleaved fashion. The interleaving allows us to use one
1893 // memory pointer.
1894 // We have one entry for a decoded VLC code, and one entry for UVLC.
1895 // Entries are 16 bits each, corresponding to one quad,
1896 // but since we want to use XMM registers of the SSE family
1897 // of SIMD; we allocated 16 bytes or more per quad row; that is,
1898 // the width is no smaller than 16 bytes (or 8 entries), and the
1899 // height is 512 quads
1900 // Each VLC entry contains, in the following order, starting
1901 // from MSB
1902 // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits)
1903 // Each entry in UVLC contains u_q
1904 // One extra row to handle the case of SPP propagating downwards
1905 // when codeblock width is 4
1906 // We need an extra two entries (one inf and one u_q) beyond
1907 // the last column.
1908 // If the block width is 4 (2 quads), then we use sstr of 8
1909 // (enough for 4 quads). If width is 8 (4 quads) we use
1910 // sstr is 16 (enough for 8 quads). For a width of 16 (8
1911 // quads), we use 24 (enough for 12 quads).
1912 ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8
1913
1914#ifdef __MINGW64__
1915 ui16 scratch[8 * 513] = {0};
1916#else
1917 ui16 scratch[8 * 513];
1918 ui32 quad_rows = (height + 1u) >> 1;
1919 size_t scratch_zero = (size_t)(quad_rows + 1) * sstr;
1920 if (scratch_zero > 8 * 513) scratch_zero = 8 * 513;
1921 memset(scratch, 0, scratch_zero * sizeof(ui16));
1922#endif
1923
1924 assert((stride & 0x3) == 0);
1925
1926 ui32 mmsbp2 = missing_msbs + 2;
1927
1928 // The cleanup pass is decoded in two steps; in step one,
1929 // the VLC and MEL segments are decoded, generating a record that
1930 // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k.
1931 // This information should be sufficient for the next step.
1932 // In step 2, we decode the MagSgn segment.
1933
1934 // step 1: decode VLC and MEL segments into scratch
1935 decode_cb_step1_vlc(scratch, coded_data, lcup, scup, width, height, sstr);
1936
1937 // step2 we decode magsgn
1938 // mmsbp2 equals K_max + 1 (we decode up to K_max bits + 1 sign bit)
1939 // The 32 bit path decode 16 bits data, for which one would think
1940 // 16 bits are enough, because we want to put in the center of the
1941 // bin.
1942 // If you have mmsbp2 equals 16 bit, and reversible coding, and
1943 // no bitplanes are missing, then we can decoding using the 16 bit
1944 // path, but we are not doing this here.
1945 if (mmsbp2 >= 16)
1946 {
1947 if (!decode_cb_step2_32bit(scratch, decoded_data, coded_data,
1948 width, height, stride, sstr, p, mmsbp2,
1949 lcup, scup))
1950 return false;
1951 }
1952 else {
1953 if (!decode_cb_step2_16bit(scratch, decoded_data, coded_data,
1954 width, height, stride, sstr, p, mmsbp2,
1955 lcup, scup))
1956 return false;
1957 }
1958
1959 if (num_passes > 1)
1960 decode_cb_spp_mrp(scratch, decoded_data, coded_data, width, height,
1961 stride, sstr, p, num_passes, lengths1, lengths2,
1962 stripe_causal);
1963
1964 return true;
1965 }
1966 }
1967}
1968
1969#endif
ui32 uvlc_tbl1_wide[4096]
uvlc_tbl1_wide: wider UVLC table for non-initial rows. Index = mode(2 bits) * 1024 + vlc_data(10 bits...
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
int64_t si64
Definition ojph_defs.h:57
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
int32_t si32
Definition ojph_defs.h:55
int16_t si16
Definition ojph_defs.h:53
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define OJPH_NO_INLINE
Definition ojph_arch.h:75
#define OJPH_FORCE_INLINE
Definition ojph_arch.h:74
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define OJPH_WARN(t,...)
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)