44#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
102 ui32 val = 0xFFFFFFFF;
103 if (melp->size > 4) {
104 memcpy(&val, melp->data, 4);
108 else if (melp->size > 0)
111 while (melp->size > 1) {
112 ui32 v = *melp->data++;
113 ui32 m = ~(0xFFu << i);
114 val = (val & m) | (v << i);
119 ui32 v = *melp->data++;
121 ui32 m = ~(0xFFu << i);
122 val = (val & m) | (v << i);
127 int bits = 32 - melp->unstuff;
134 bool unstuff = ((val & 0xFF) == 0xFF);
136 t = t << (8 - unstuff);
139 t |= (val>>8) & 0xFF;
140 unstuff = (((val >> 8) & 0xFF) == 0xFF);
142 t = t << (8 - unstuff);
144 t |= (val>>16) & 0xFF;
145 unstuff = (((val >> 16) & 0xFF) == 0xFF);
147 t = t << (8 - unstuff);
149 t |= (val>>24) & 0xFF;
150 melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
154 melp->tmp |= ((
ui64)t) << (64 - bits - melp->bits);
176 static const int mel_exp[13] = {
177 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
186 while (melp->bits >= 6 && melp->num_runs < 8)
188 int eval = mel_exp[melp->k];
190 if (melp->tmp & (1ull<<63))
194 melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;
201 run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1);
202 melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0;
203 melp->tmp <<= eval + 1;
204 melp->bits -= eval + 1;
205 run = (run << 1) + 1;
207 eval = melp->num_runs * 7;
208 melp->runs &= ~((
ui64)0x3F << eval);
209 melp->runs |= ((
ui64)run) << eval;
227 melp->data = bbuf + lcup - scup;
230 melp->unstuff =
false;
231 melp->size = scup - 1;
239 int num = 4 - (int)(intptr_t(melp->data) & 0x3);
240 for (
int i = 0; i < num; ++i) {
241 assert(melp->unstuff ==
false || melp->data[0] <= 0x8F);
242 ui64 d = (melp->size > 0) ? *melp->data : 0xFF;
244 if (melp->size == 1) d |= 0xF;
246 melp->data += melp->size-- > 0;
247 int d_bits = 8 - melp->unstuff;
248 melp->tmp = (melp->tmp << d_bits) | d;
249 melp->bits += d_bits;
250 melp->unstuff = ((d & 0xFF) == 0xFF);
253 melp->tmp <<= (64 - melp->bits);
266 if (melp->num_runs == 0)
269 int t = melp->runs & 0x7F;
309 ui8* o_end = dst + cap;
311 const ui8* s_end = src + size;
314 bool prev_ff =
false;
317 while (s + 16 <= s_end && o + 24 <= o_end)
319 __m128i v = _mm_loadu_si128((
const __m128i*)s);
320 int ff = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_set1_epi8(-1)));
321 if (ff != 0 || prev_ff)
323 for (
int i = 0; i < 16; ++i) {
325 acc |= (
ui64)b << nb;
326 nb += prev_ff ? 7u : 8u;
327 prev_ff = (b == 0xFFu);
328 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
334 memcpy(&v1, s + 8, 8);
335 ui64 w0 = acc | (v0 << nb);
336 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
338 memcpy(o + 8, &w1, 8);
339 acc = nb ? (v1 >> (64 - nb)) : 0;
344 while (s < s_end && o < o_end)
347 acc |= (
ui64)b << nb;
348 nb += prev_ff ? 7u : 8u;
349 prev_ff = (b == 0xFFu);
350 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
353 ui32 fill = (X == 0xFF) ? (0xFFu << nb) : 0;
355 __m256i pad = _mm256_set1_epi8((
char)X);
356 _mm256_storeu_si256((__m256i*)(o + 1), pad);
357 _mm256_storeu_si256((__m256i*)(o + 33), pad);
358 return (
ui32)(o - dst) + 1;
376 __m128i dfetch(
const ui8* dbuf,
ui32 limit,
ui32 pos)
379 off = off < limit ? off : limit;
380 const ui8* p = dbuf + off;
381 __m128i v = _mm_loadu_si128((
const __m128i*)p);
382 __m128i w = _mm_loadu_si128((
const __m128i*)(p + 8));
383 int k = (int)(pos & 7);
384 __m128i r = _mm_srl_epi64(v, _mm_cvtsi32_si128(k));
385 __m128i c = _mm_sll_epi64(w, _mm_cvtsi32_si128(64 - k));
386 return _mm_or_si128(r, c);
404 off = off < limit ? off : limit;
406 memcpy(&v, dbuf + off, 8);
407 return v >> (pos & 7);
434 ui32 o = off < limit ? off : limit;
435 memcpy(&v, dbuf + o, 8);
437 off += (63 - bits) >> 3;
486 ui32 destuff_rev(
const ui8* p,
int size,
bool unstuff,
490 ui8* o_end = dst + cap;
495 if (size > 0 && o < o_end)
499 acc |= (
ui64)d << nb;
500 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
502 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
506 while (size >= 16 && o + 24 <= o_end)
508 __m128i v = _mm_loadu_si128((
const __m128i*)(p - 15));
509 __m128i nx = _mm_loadu_si128((
const __m128i*)(p - 14));
510 __m128i is7f = _mm_cmpeq_epi8(
511 _mm_and_si128(v, _mm_set1_epi8(0x7F)), _mm_set1_epi8(0x7F));
513 __m128i le8f = _mm_cmpeq_epi8(
514 _mm_subs_epu8(nx, _mm_set1_epi8((
char)0x8F)),
515 _mm_setzero_si128());
516 __m128i stuff = _mm_andnot_si128(le8f, is7f);
517 if (!_mm_testz_si128(stuff, stuff))
519 for (
int i = 0; i < 16; ++i) {
521 acc |= (
ui64)d << nb;
522 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
524 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
529 __m128i r = _mm_shuffle_epi8(v,
530 _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
531 8, 9, 10, 11, 12, 13, 14, 15));
532 ui64 v0 = (
ui64)_mm_cvtsi128_si64(r);
533 ui64 v1 = (
ui64)_mm_extract_epi64(r, 1);
534 ui64 w0 = acc | (v0 << nb);
535 ui64 w1 = (v1 << nb) | (nb ? (v0 >> (64 - nb)) : 0);
537 memcpy(o + 8, &w1, 8);
538 acc = nb ? (v1 >> (64 - nb)) : 0;
542 unstuff = p[1] > 0x8F;
545 while (size > 0 && o < o_end)
549 acc |= (
ui64)d << nb;
550 nb += 8 - ((unstuff && ((d & 0x7F) == 0x7F)) ? 1u : 0u);
552 if (nb >= 8) { *o++ = (
ui8)acc; acc >>= 8; nb -= 8; }
556 __m256i z = _mm256_setzero_si256();
557 _mm256_storeu_si256((__m256i*)(o + 1), z);
558 _mm256_storeu_si256((__m256i*)(o + 33), z);
559 return (
ui32)(o - dst) + 1;
579 ui32 destuff_vlc(
const ui8* data,
int lcup,
int scup,
582 const ui8* p = data + lcup - 2;
585 ui32 nb = 4 - ((acc & 7) == 7);
586 bool unstuff = (d | 0xF) > 0x8F;
587 return destuff_rev(p - 1, scup - 2, unstuff, acc, nb, dst, cap);
606 ui32 destuff_mrp(
const ui8* data,
int lcup,
int len2,
609 return destuff_rev(data + lcup + len2 - 1, len2,
true, 0, 0,
624 __m256i decode_two_quad32_avx2(__m256i inf_u_q, __m256i U_q,
626 ui32 p, __m128i& vn) {
627 __m256i row = _mm256_setzero_si256();
630 __m256i flags = _mm256_and_si256(inf_u_q, _mm256_set_epi32(0x8880, 0x4440, 0x2220, 0x1110, 0x8880, 0x4440, 0x2220, 0x1110));
631 __m256i insig = _mm256_cmpeq_epi32(flags, _mm256_setzero_si256());
633 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
635 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 1, 2, 2, 4, 4, 8, 8, 1, 1, 2, 2, 4, 4, 8, 8));
643 __m256i w0 = _mm256_srli_epi32(flags, 15);
644 m_n = _mm256_sub_epi32(U_q, w0);
645 m_n = _mm256_andnot_si256(insig, m_n);
649 __m256i inc_sum = m_n;
650 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
651 inc_sum = _mm256_add_epi32(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
652 ui32 total_mn1 = (
ui32)_mm256_extract_epi16(inc_sum, 6);
653 ui32 total_mn2 = (
ui32)_mm256_extract_epi16(inc_sum, 14);
655 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
656 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
657 pos += total_mn1 + total_mn2;
659 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
661 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 4);
664 __m256i byte_idx = _mm256_srli_epi32(ex_sum, 3);
665 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi32(7));
666 byte_idx = _mm256_shuffle_epi8(byte_idx,
667 _mm256_set_epi32(0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000, 0x0C0C0C0C, 0x08080808, 0x04040404, 0x00000000));
668 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x03020100));
669 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
670 byte_idx = _mm256_add_epi32(byte_idx, _mm256_set1_epi32(0x01010101));
671 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
674 bit_idx = _mm256_or_si256(bit_idx, _mm256_slli_epi32(bit_idx, 16));
676 __m128i a = _mm_set_epi8(1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1);
677 __m256i aa = _mm256_inserti128_si256(_mm256_castsi128_si256(a), a, 0x1);
679 __m256i bit_shift = _mm256_shuffle_epi8(aa, bit_idx);
680 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
681 d0 = _mm256_mullo_epi16(d0, bit_shift);
682 d0 = _mm256_srli_epi16(d0, 8);
683 d1 = _mm256_mullo_epi16(d1, bit_shift);
684 d1 = _mm256_and_si256(d1, _mm256_set1_epi32((
si32)0xFF00FF00));
685 d0 = _mm256_or_si256(d0, d1);
689 __m256i ones = _mm256_set1_epi32(1);
690 __m256i twos = _mm256_set1_epi32(2);
691 __m256i U_q_m1 = _mm256_sub_epi32(U_q, ones);
692 U_q_m1 = _mm256_and_si256(U_q_m1, _mm256_set_epi32(0, 0, 0, 0x1F, 0, 0, 0, 0x1F));
693 U_q_m1 = _mm256_shuffle_epi32(U_q_m1, 0);
694 w0 = _mm256_sub_epi32(twos, w0);
695 shift = _mm256_sllv_epi32(w0, U_q_m1);
696 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi32(shift, ones));
699 w0 = _mm256_and_si256(flags, _mm256_set1_epi32(0x800));
700 w0 = _mm256_cmpeq_epi32(w0, _mm256_setzero_si256());
701 w0 = _mm256_andnot_si256(w0, shift);
702 ms_vec = _mm256_or_si256(ms_vec, w0);
703 w0 = _mm256_slli_epi32(ms_vec, 31);
704 ms_vec = _mm256_or_si256(ms_vec, ones);
705 __m256i tvn = ms_vec;
706 ms_vec = _mm256_add_epi32(ms_vec, twos);
707 ms_vec = _mm256_slli_epi32(ms_vec, (
si32)p - 1);
708 ms_vec = _mm256_or_si256(ms_vec, w0);
709 row = _mm256_andnot_si256(insig, ms_vec);
711 ms_vec = _mm256_andnot_si256(insig, tvn);
713 tvn = _mm256_shuffle_epi8(ms_vec, _mm256_set_epi32(-1, 0x0F0E0D0C, 0x07060504, -1, -1, -1, 0x0F0E0D0C, 0x07060504));
715 vn = _mm_or_si128(vn, _mm256_castsi256_si128(tvn));
716 vn = _mm_or_si128(vn, _mm256_extracti128_si256(tvn, 0x1));
734 __m256i decode_four_quad16(
const __m128i inf_u_q, __m128i U_q,
736 ui32 p, __m128i& vn) {
742 __m256i row = _mm256_setzero_si256();
743 __m128i ddd = _mm_shuffle_epi8(inf_u_q,
744 _mm_set_epi16(0x0d0c, 0x0d0c, 0x0908, 0x908, 0x0504, 0x0504, 0x0100, 0x0100));
745 w0 = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
746 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
748 flags = _mm256_and_si256(w0,
749 _mm256_set_epi16((
si16)0x8880, 0x4440, 0x2220, 0x1110,
750 (
si16)0x8880, 0x4440, 0x2220, 0x1110,
751 (
si16)0x8880, 0x4440, 0x2220, 0x1110,
752 (
si16)0x8880, 0x4440, 0x2220, 0x1110));
753 insig = _mm256_cmpeq_epi16(flags, _mm256_setzero_si256());
754 if ((uint32_t)_mm256_movemask_epi8(insig) != (uint32_t)0xFFFFFFFF)
756 ddd = _mm_or_si128(_mm_bslli_si128(U_q, 2), U_q);
757 __m256i U_q_avx = _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ddd),
758 _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
759 flags = _mm256_mullo_epi16(flags, _mm256_set_epi16(1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8, 1, 2, 4, 8));
767 w0 = _mm256_srli_epi16(flags, 15);
768 m_n = _mm256_sub_epi16(U_q_avx, w0);
769 m_n = _mm256_andnot_si256(insig, m_n);
773 __m256i inc_sum = m_n;
774 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 2));
775 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 4));
776 inc_sum = _mm256_add_epi16(inc_sum, _mm256_bslli_epi128(inc_sum, 8));
777 ui32 total_mn1 = (
ui32)_mm256_extract_epi16(inc_sum, 7);
778 ui32 total_mn2 = (
ui32)_mm256_extract_epi16(inc_sum, 15);
779 __m256i ex_sum = _mm256_bslli_epi128(inc_sum, 2);
781 __m128i ms_vec0 = dfetch(dbuf, limit, pos);
782 __m128i ms_vec1 = dfetch(dbuf, limit, pos + total_mn1);
783 pos += total_mn1 + total_mn2;
785 __m256i ms_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ms_vec0), ms_vec1, 0x1);
788 __m256i byte_idx = _mm256_srli_epi16(ex_sum, 3);
789 __m256i bit_idx = _mm256_and_si256(ex_sum, _mm256_set1_epi16(7));
790 byte_idx = _mm256_shuffle_epi8(byte_idx,
791 _mm256_set_epi16(0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
792 0x0606, 0x0404, 0x0202, 0x0000, 0x0E0E, 0x0C0C, 0x0A0A, 0x0808,
793 0x0606, 0x0404, 0x0202, 0x0000));
794 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0100));
795 __m256i d0 = _mm256_shuffle_epi8(ms_vec, byte_idx);
796 byte_idx = _mm256_add_epi16(byte_idx, _mm256_set1_epi16(0x0101));
797 __m256i d1 = _mm256_shuffle_epi8(ms_vec, byte_idx);
800 __m256i bit_shift = _mm256_shuffle_epi8(
801 _mm256_set_epi8(1, 3, 7, 15, 31, 63, 127, -1,
802 1, 3, 7, 15, 31, 63, 127, -1, 1, 3, 7, 15, 31, 63, 127, -1,
803 1, 3, 7, 15, 31, 63, 127, -1), bit_idx);
804 bit_shift = _mm256_add_epi16(bit_shift, _mm256_set1_epi16(0x0101));
805 d0 = _mm256_mullo_epi16(d0, bit_shift);
806 d0 = _mm256_srli_epi16(d0, 8);
807 d1 = _mm256_mullo_epi16(d1, bit_shift);
808 d1 = _mm256_and_si256(d1, _mm256_set1_epi16((
si16)0xFF00));
809 d0 = _mm256_or_si256(d0, d1);
813 __m256i ones = _mm256_set1_epi16(1);
814 __m256i twos = _mm256_set1_epi16(2);
821 __m256i kq = _mm256_sub_epi16(U_q_avx, ones);
822 __m256i idx = _mm256_or_si256(kq,
823 _mm256_slli_epi16(_mm256_sub_epi16(kq,
824 _mm256_set1_epi16(8)), 8));
825 const __m256i pow2_tbl = _mm256_setr_epi8(
826 1, 2, 4, 8, 16, 32, 64, (
char)128, 0, 0, 0, 0, 0, 0, 0, 0,
827 1, 2, 4, 8, 16, 32, 64, (
char)128, 0, 0, 0, 0, 0, 0, 0, 0);
828 __m256i pow2 = _mm256_shuffle_epi8(pow2_tbl, idx);
829 w0 = _mm256_sub_epi16(twos, w0);
830 shift = _mm256_mullo_epi16(w0, pow2);
831 ms_vec = _mm256_and_si256(d0, _mm256_sub_epi16(shift, ones));
834 w0 = _mm256_and_si256(flags, _mm256_set1_epi16(0x800));
835 w0 = _mm256_cmpeq_epi16(w0, _mm256_setzero_si256());
836 w0 = _mm256_andnot_si256(w0, shift);
837 ms_vec = _mm256_or_si256(ms_vec, w0);
838 w0 = _mm256_slli_epi16(ms_vec, 15);
839 ms_vec = _mm256_or_si256(ms_vec, ones);
840 __m256i tvn = ms_vec;
841 ms_vec = _mm256_add_epi16(ms_vec, twos);
842 ms_vec = _mm256_slli_epi16(ms_vec, (
si32)p - 1);
843 ms_vec = _mm256_or_si256(ms_vec, w0);
844 row = _mm256_andnot_si256(insig, ms_vec);
846 ms_vec = _mm256_andnot_si256(insig, tvn);
848 __m256i ms_vec_shuffle1 = _mm256_shuffle_epi8(ms_vec,
849 _mm256_set_epi16(-1, -1, -1, -1, 0x0706, 0x0302, -1, -1,
850 -1, -1, -1, -1, -1, -1, 0x0706, 0x0302));
851 __m256i ms_vec_shuffle2 = _mm256_shuffle_epi8(ms_vec,
852 _mm256_set_epi16(-1, -1, -1, 0x0F0E, 0x0B0A, -1, -1, -1,
853 -1, -1, -1, -1, -1, 0x0F0E, 0x0B0A, -1));
854 ms_vec = _mm256_or_si256(ms_vec_shuffle1, ms_vec_shuffle2);
856 vn = _mm_or_si128(vn, _mm256_castsi256_si128(ms_vec));
857 vn = _mm_or_si128(vn, _mm256_extracti128_si256(ms_vec, 0x1));
863 inline __m256i avx2_lzcnt_epi32(__m256i v) {
865 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
867 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
868 v = _mm256_srli_epi32(v, 23);
869 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
870 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
902 bool decode_cb_step2_16bit(
ui16* scratch,
ui32* decoded_data,
910 const int v_n_size = 512 + 16;
912 ui16 v_n_scratch[v_n_size] = {0};
913 ui32 v_n_scratch_32[v_n_size] = {0};
915 ui16 v_n_scratch[v_n_size];
916 memset(v_n_scratch + (width >> 1) + 4, 0, 8 *
sizeof(
ui16));
917 ui32 v_n_scratch_32[v_n_size];
921 const ui32 dbuf_cap = 4096 * 15 / 8;
922 ui8 dbuf[dbuf_cap + 72];
923 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
928 ui16 *vp = v_n_scratch;
929 ui32 *dp = decoded_data;
932 for (
ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8) {
934 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
935 __m128i U_q = _mm_srli_epi32(inf_u_q, 16);
936 __m128i w = _mm_cmpgt_epi32(U_q, _mm_set1_epi32((
int)mmsbp2));
937 if (!_mm_testz_si128(w, w)) {
941 __m128i vn = _mm_set1_epi16(2);
942 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
944 w = _mm_cvtsi32_si128(*(
unsigned short const*)(vp));
945 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
947 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
948 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
950 _mm256_storeu_si256((__m256i*)dp, w0);
951 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
955 for (
ui32 y = 2; y < height; y += 2) {
958 ui16 *vp = v_n_scratch;
959 ui32 *vp_32 = v_n_scratch_32;
961 ui16* sp = scratch + (y >> 1) * sstr;
962 const __m256i avx_mmsbp2 = _mm256_set1_epi32((
int)mmsbp2);
963 const __m256i avx_31 = _mm256_set1_epi32(31);
964 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
965 const __m256i avx_1 = _mm256_set1_epi32(1);
966 const __m256i avx_0 = _mm256_setzero_si256();
968 for (
ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16, vp_32 += 8) {
969 __m128i v = _mm_loadu_si128((__m128i*)vp);
970 __m128i v_p1 = _mm_loadu_si128((__m128i*)(vp + 1));
971 v = _mm_or_si128(v, v_p1);
973 __m256i v_avx = _mm256_cvtepu16_epi32(v);
974 v_avx = avx2_lzcnt_epi32(v_avx);
975 v_avx = _mm256_sub_epi32(avx_31, v_avx);
977 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
978 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
979 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
980 gamma = _mm256_and_si256(gamma, w0);
981 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
983 v_avx = _mm256_andnot_si256(gamma, v_avx);
984 v_avx = _mm256_max_epi32(v_avx, avx_1);
986 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
987 v_avx = _mm256_add_epi32(inf_u_q, v_avx);
989 w0 = _mm256_cmpgt_epi32(v_avx, avx_mmsbp2);
990 if (!_mm256_testz_si256(w0, w0)) {
994 _mm256_storeu_si256((__m256i*)vp_32, v_avx);
998 ui16 *vp = v_n_scratch;
999 ui32* vp_32 = v_n_scratch_32;
1000 ui16 *sp = scratch + (y >> 1) * sstr;
1001 ui32 *dp = decoded_data + y * stride;
1004 for (
ui32 x = 0; x < width; x += 8, sp += 8, vp += 4, dp += 8, vp_32 += 4) {
1006 __m128i inf_u_q = _mm_loadu_si128((__m128i*)sp);
1007 __m128i U_q = _mm_loadu_si128((__m128i*)vp_32);
1009 __m128i vn = _mm_set1_epi16(2);
1010 __m256i row = decode_four_quad16(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1012 __m128i w = _mm_cvtsi32_si128(*(
unsigned short const*)(vp));
1013 _mm_storeu_si128((__m128i*)vp, _mm_or_si128(w, vn));
1015 __m256i w0 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1, 0x0D0C, -1, 0x0908, -1, 0x0504, -1, 0x0100, -1));
1016 __m256i w1 = _mm256_shuffle_epi8(row, _mm256_set_epi16(0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1, 0x0F0E, -1, 0x0B0A, -1, 0x0706, -1, 0x0302, -1));
1018 _mm256_storeu_si256((__m256i*)dp, w0);
1019 _mm256_storeu_si256((__m256i*)(dp + stride), w1);
1034 bool decode_cb_step2_32bit(
ui16* scratch,
ui32* decoded_data,
1039 const int v_n_size = 512 + 16;
1041 ui32 v_n_scratch[2 * v_n_size] = {0};
1043 ui32 v_n_scratch[2 * v_n_size];
1044 memset(v_n_scratch + (width >> 1) + 2, 0, 14 *
sizeof(
ui32));
1048 const ui32 dbuf_cap = 4096 * 32 / 8;
1049 ui8 dbuf[dbuf_cap + 72];
1050 ui32 limit = destuff_frwd<0xFF>(coded_data, lcup - scup, dbuf, dbuf_cap);
1053 const __m256i avx_mmsbp2 = _mm256_set1_epi32((
int)mmsbp2);
1057 ui32 *vp = v_n_scratch;
1058 ui32 *dp = decoded_data;
1061 for (
ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4)
1063 __m128i vn = _mm_set1_epi32(2);
1065 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1066 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1068 __m256i U_q = _mm256_srli_epi32(inf_u_q, 16);
1069 __m256i w = _mm256_cmpgt_epi32(U_q, avx_mmsbp2);
1070 if (!_mm256_testz_si256(w, w)) {
1074 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1075 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1076 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1077 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1079 __m128i w0 = _mm_cvtsi32_si128(*(
int const*)vp);
1080 w0 = _mm_or_si128(w0, vn);
1081 _mm_storeu_si128((__m128i*)vp, w0);
1085 for (
ui32 y = 2; y < height; y += 2)
1089 ui32 *vp = v_n_scratch;
1090 ui16* sp = scratch + (y >> 1) * sstr;
1092 const __m256i avx_31 = _mm256_set1_epi32(31);
1093 const __m256i avx_f0 = _mm256_set1_epi32(0xF0);
1094 const __m256i avx_1 = _mm256_set1_epi32(1);
1095 const __m256i avx_0 = _mm256_setzero_si256();
1097 for (
ui32 x = 0; x <= width; x += 16, vp += 8, sp += 16) {
1098 __m256i v = _mm256_loadu_si256((__m256i*)vp);
1099 __m256i v_p1 = _mm256_loadu_si256((__m256i*)(vp + 1));
1100 v = _mm256_or_si256(v, v_p1);
1101 v = avx2_lzcnt_epi32(v);
1102 v = _mm256_sub_epi32(avx_31, v);
1104 __m256i inf_u_q = _mm256_loadu_si256((__m256i*)sp);
1105 __m256i gamma = _mm256_and_si256(inf_u_q, avx_f0);
1106 __m256i w0 = _mm256_sub_epi32(gamma, avx_1);
1107 gamma = _mm256_and_si256(gamma, w0);
1108 gamma = _mm256_cmpeq_epi32(gamma, avx_0);
1110 v = _mm256_andnot_si256(gamma, v);
1111 v = _mm256_max_epi32(v, avx_1);
1113 inf_u_q = _mm256_srli_epi32(inf_u_q, 16);
1114 v = _mm256_add_epi32(inf_u_q, v);
1116 w0 = _mm256_cmpgt_epi32(v, avx_mmsbp2);
1117 if (!_mm256_testz_si256(w0, w0)) {
1121 _mm256_storeu_si256((__m256i*)(vp + v_n_size), v);
1125 ui32 *vp = v_n_scratch;
1126 ui16 *sp = scratch + (y >> 1) * sstr;
1127 ui32 *dp = decoded_data + y * stride;
1130 for (
ui32 x = 0; x < width; x += 4, sp += 4, vp += 2, dp += 4) {
1132 __m128i vn = _mm_set1_epi32(2);
1134 __m256i inf_u_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)sp));
1135 inf_u_q = _mm256_permutevar8x32_epi32(inf_u_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1137 __m256i U_q = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i*)(vp + v_n_size)));
1138 U_q = _mm256_permutevar8x32_epi32(U_q, _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1));
1140 __m256i row = decode_two_quad32_avx2(inf_u_q, U_q, dbuf, limit, pos, p, vn);
1141 row = _mm256_permutevar8x32_epi32(row, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7));
1142 _mm_store_si128((__m128i*)dp, _mm256_castsi256_si128(row));
1143 _mm_store_si128((__m128i*)(dp + stride), _mm256_extracti128_si256(row, 0x1));
1145 __m128i w0 = _mm_cvtsi32_si128(*(
int const*)vp);
1146 w0 = _mm_or_si128(w0, vn);
1147 _mm_storeu_si128((__m128i*)vp, w0);
1161 void decode_cb_spp_mrp(
ui16* scratch,
ui32* decoded_data,
ui8* coded_data,
1164 ui32 lengths2,
bool stripe_causal)
1170 ui16*
const sigma = scratch;
1172 ui32 mstr = (width + 3u) >> 2;
1174 mstr = ((mstr + 2u) + 7u) & ~7u;
1182 const __m128i mask_3 = _mm_set1_epi32(0x30);
1183 const __m128i mask_C = _mm_set1_epi32(0xC0);
1184 const __m128i shuffle_mask = _mm_set_epi32(-1, -1, -1, 0x0C080400);
1185 for (y = 0; y < height; y += 4)
1187 ui16* sp = scratch + (y >> 1) * sstr;
1188 ui16* dp = sigma + (y >> 2) * mstr;
1189 for (
ui32 x = 0; x < width; x += 8, sp += 8, dp += 2)
1191 __m128i s0, s1, u3, uC, t0, t1;
1193 s0 = _mm_loadu_si128((__m128i*)(sp));
1194 u3 = _mm_and_si128(s0, mask_3);
1195 u3 = _mm_srli_epi32(u3, 4);
1196 uC = _mm_and_si128(s0, mask_C);
1197 uC = _mm_srli_epi32(uC, 2);
1198 t0 = _mm_or_si128(u3, uC);
1200 s1 = _mm_loadu_si128((__m128i*)(sp + sstr));
1201 u3 = _mm_and_si128(s1, mask_3);
1202 u3 = _mm_srli_epi32(u3, 2);
1203 uC = _mm_and_si128(s1, mask_C);
1204 t1 = _mm_or_si128(u3, uC);
1206 __m128i r = _mm_or_si128(t0, t1);
1207 r = _mm_shuffle_epi8(r, shuffle_mask);
1209 ui32 t = (
ui32)_mm_extract_epi32(r, 0);
1216 ui16* dp = sigma + (y >> 2) * mstr;
1217 __m128i zero = _mm_setzero_si128();
1218 for (
ui32 x = 0; x < width; x += 32, dp += 8)
1219 _mm_storeu_si128((__m128i*)dp, zero);
1235 ui16 prev_row_sig[256 + 8] = {0};
1239 const ui32 spp_cap = 4096 * 2 / 8;
1240 ui8 spp_buf[spp_cap + 72];
1241 ui32 spp_limit = destuff_frwd<0>(coded_data + lengths1,
1242 (
int)lengths2, spp_buf, spp_cap);
1245 for (
ui32 y = 0; y < height; y += 4)
1247 ui32 pattern = 0xFFFFu;
1248 if (height - y < 4) {
1250 if (height - y < 3) {
1260 ui16 *prev_sig = prev_row_sig;
1261 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1262 ui32 *dpp = decoded_data + y * stride;
1263 for (
ui32 x = 0; x < width; x += 4, dpp += 4, ++cur_sig, ++prev_sig)
1268 pattern = pattern >> (s * 4);
1283 memcpy(&ps, prev_sig, 4);
1284 memcpy(&ns, cur_sig + mstr, 4);
1285 ui32 u = (ps & 0x88888888) >> 3;
1287 u |= (ns & 0x11111111) << 3;
1289 memcpy(&cs, cur_sig, 4);
1292 mbr |= (cs & 0x77777777) << 1;
1293 mbr |= (cs & 0xEEEEEEEE) >> 1;
1309 ui64 cwd = dfetch64(spp_buf, spp_limit, spp_pos);
1312 ui32 col_mask = 0xFu;
1313 ui32 inv_sig = ~cs & pattern;
1314 for (
int i = 0; i < 16; i += 4, col_mask <<= 4)
1316 if ((col_mask & new_sig) == 0)
1320 ui32 sample_mask = 0x1111u & col_mask;
1321 if (new_sig & sample_mask)
1323 new_sig &= ~sample_mask;
1326 ui32 t = 0x33u << i;
1327 new_sig |= t & inv_sig;
1333 if (new_sig & sample_mask)
1335 new_sig &= ~sample_mask;
1338 ui32 t = 0x76u << i;
1339 new_sig |= t & inv_sig;
1345 if (new_sig & sample_mask)
1347 new_sig &= ~sample_mask;
1350 ui32 t = 0xECu << i;
1351 new_sig |= t & inv_sig;
1357 if (new_sig & sample_mask)
1359 new_sig &= ~sample_mask;
1362 ui32 t = 0xC8u << i;
1363 new_sig |= t & inv_sig;
1376 __m128i new_sig_vec = _mm_set1_epi16((
si16)new_sig);
1377 new_sig_vec = _mm_shuffle_epi8(new_sig_vec,
1378 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1379 new_sig_vec = _mm_and_si128(new_sig_vec,
1380 _mm_set1_epi64x((
si64)0x8040201008040201));
1381 new_sig_vec = _mm_cmpeq_epi8(new_sig_vec,
1382 _mm_set1_epi64x((
si64)0x8040201008040201));
1386 __m128i inc_sum = new_sig_vec;
1387 inc_sum = _mm_abs_epi8(inc_sum);
1388 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1389 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1390 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1391 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1392 cnt += (
ui32)_mm_extract_epi16(inc_sum, 7) >> 8;
1394 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1398 __m128i cwd_vec = _mm_set1_epi16((
si16)cwd);
1399 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1400 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1401 cwd_vec = _mm_and_si128(cwd_vec,
1402 _mm_set1_epi64x((
si64)0x8040201008040201));
1403 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1404 _mm_set1_epi64x((
si64)0x8040201008040201));
1405 cwd_vec = _mm_abs_epi8(cwd_vec);
1409 __m128i v = _mm_shuffle_epi8(cwd_vec, ex_sum);
1413 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1414 __m128i val = _mm_set1_epi32(3 << (p - 2));
1416 for (
int c = 0; c < 4; ++ c) {
1417 __m128i s0, s0_ns, s0_val;
1419 s0 = _mm_load_si128((__m128i*)dp);
1423 s0_ns = _mm_shuffle_epi8(new_sig_vec, m);
1424 s0_ns = _mm_cmpeq_epi32(s0_ns, _mm_set1_epi32(0xFF));
1427 s0_val = _mm_shuffle_epi8(v, m);
1428 s0_val = _mm_slli_epi32(s0_val, 31);
1429 s0_val = _mm_or_si128(s0_val, val);
1430 s0_val = _mm_and_si128(s0_val, s0_ns);
1433 s0 = _mm_or_si128(s0, s0_val);
1435 _mm_store_si128((__m128i*)dp, s0);
1438 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1445 *prev_sig = (
ui16)(new_sig);
1449 new_sig |= (t & 0x7777) << 1;
1450 new_sig |= (t & 0xEEEE) >> 1;
1464 const ui32 mrp_cap = 1024;
1465 ui8 mrp_buf[mrp_cap + 72];
1466 ui32 mrp_limit = destuff_mrp(coded_data, (
int)lengths1,
1467 (
int)lengths2, mrp_buf, mrp_cap);
1470 for (
ui32 y = 0; y < height; y += 4)
1472 ui16 *cur_sig = sigma + (y >> 2) * mstr;
1473 ui32 *dpp = decoded_data + y * stride;
1474 for (
ui32 i = 0; i < width; i += 4, dpp += 4)
1478 ui64 cwd = dfetch64(mrp_buf, mrp_limit, mrp_pos);
1479 ui16 sig = *cur_sig++;
1487 __m128i sig_vec = _mm_set1_epi16((
si16)sig);
1488 sig_vec = _mm_shuffle_epi8(sig_vec,
1489 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1490 sig_vec = _mm_and_si128(sig_vec,
1491 _mm_set1_epi64x((
si64)0x8040201008040201));
1492 sig_vec = _mm_cmpeq_epi8(sig_vec,
1493 _mm_set1_epi64x((
si64)0x8040201008040201));
1494 sig_vec = _mm_abs_epi8(sig_vec);
1498 __m128i inc_sum = sig_vec;
1499 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 1));
1500 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 2));
1501 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 4));
1502 inc_sum = _mm_add_epi8(inc_sum, _mm_bslli_si128(inc_sum, 8));
1503 total_bits = _mm_extract_epi16(inc_sum, 7) >> 8;
1504 __m128i ex_sum = _mm_bslli_si128(inc_sum, 1);
1511 __m128i cwd_vec = _mm_set1_epi16((
si16)cwd);
1512 cwd_vec = _mm_shuffle_epi8(cwd_vec,
1513 _mm_set_epi8(1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0));
1514 cwd_vec = _mm_and_si128(cwd_vec,
1515 _mm_set1_epi64x((
si64)0x8040201008040201));
1516 cwd_vec = _mm_cmpeq_epi8(cwd_vec,
1517 _mm_set1_epi64x((
si64)0x8040201008040201));
1518 cwd_vec = _mm_add_epi8(cwd_vec, _mm_set1_epi8(1));
1519 cwd_vec = _mm_add_epi8(cwd_vec, cwd_vec);
1520 cwd_vec = _mm_or_si128(cwd_vec, _mm_set1_epi8(1));
1524 _mm_set_epi8(-1,-1,-1,12,-1,-1,-1,8,-1,-1,-1,4,-1,-1,-1,0);
1526 for (
int c = 0; c < 4; ++c) {
1527 __m128i s0, s0_sig, s0_idx, s0_val;
1529 s0 = _mm_load_si128((__m128i*)dp);
1531 s0_sig = _mm_shuffle_epi8(sig_vec, m);
1532 s0_sig = _mm_cmpeq_epi8(s0_sig, _mm_setzero_si128());
1534 s0_idx = _mm_shuffle_epi8(ex_sum, m);
1535 s0_val = _mm_shuffle_epi8(cwd_vec, s0_idx);
1537 s0_val = _mm_andnot_si128(s0_sig, s0_val);
1539 s0_val = _mm_slli_epi32(s0_val, (
si32)p - 2);
1540 s0 = _mm_xor_si128(s0, s0_val);
1542 _mm_store_si128((__m128i*)dp, s0);
1545 m = _mm_add_epi32(m, _mm_set1_epi32(1));
1549 mrp_pos += (
ui32)total_bits;
1562 void decode_cb_step1_vlc(
ui16* scratch,
ui8* coded_data,
int lcup,
1567 mel_init(&mel, coded_data, lcup, scup);
1572 const ui32 vlc_cap = 4096;
1573 ui8 vlc_buf[vlc_cap + 72];
1574 ui32 vlc_limit = destuff_vlc(coded_data, lcup, scup,
1587 for (
ui32 x = 0; x < width; sp += 4)
1593 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1606 t0 = (run == -1) ? t0 : 0;
1620 c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2);
1623 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1629 t1 =
vlc_tbl0[c_q + (vlc_val & 0x7F)];
1632 if (c_q == 0 && x < width)
1637 t1 = (run == -1) ? t1 : 0;
1642 t1 = x < width ? t1 : 0;
1651 c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
1654 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1659 ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1660 if (uvlc_mode == 0xc0)
1664 uvlc_mode += (run == -1) ? 0x40 : 0;
1678 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1681 ui32 len = uvlc_entry & 0xF;
1682 ui32 tmp = (
ui32)vlc_val & ((1u << len) - 1);
1683 dconsume(vlc_val, vlc_bits, len);
1686 len = uvlc_entry & 0x7;
1688 ui16 u_q = (
ui16)(1 + (uvlc_entry&7) + (tmp&~(0xFFU<<len)));
1690 u_q = (
ui16)(1 + (uvlc_entry >> 3) + (tmp >> len));
1696 for (
ui32 y = 2; y < height; y += 2)
1699 ui16 *sp = scratch + (y >> 1) * sstr;
1701 for (
ui32 x = 0; x < width; sp += 4)
1707 c_q |= ((sp[0 - (
si32)sstr] & 0xA0U) << 2);
1708 c_q |= ((sp[2 - (
si32)sstr] & 0x20U) << 4);
1711 drefill(vlc_val, vlc_bits, vlc_off, vlc_buf, vlc_limit);
1724 t0 = (run == -1) ? t0 : 0;
1739 c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
1741 c_q |= sp[0 - (
si32)sstr] & 0x80;
1743 c_q |= ((sp[2 - (
si32)sstr] & 0xA0U) << 2);
1744 c_q |= ((sp[4 - (
si32)sstr] & 0x20U) << 4);
1747 dconsume(vlc_val, vlc_bits, t0 & 0x7u);
1753 t1 =
vlc_tbl1[ c_q + (vlc_val & 0x7F)];
1756 if (c_q == 0 && x < width)
1761 t1 = (run == -1) ? t1 : 0;
1766 t1 = x < width ? t1 : 0;
1776 c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
1778 c_q |= sp[2 - (
si32)sstr] & 0x80;
1781 dconsume(vlc_val, vlc_bits, t1 & 0x7u);
1785 ui32 uvlc_mode = (((t0 >> 3) & 1) | (((t1 >> 3) & 1) << 1));
1788 ui32 total_bits = uvlc_entry & 0x1F;
1789 if (total_bits < 0x1F) {
1790 sp[1] = (
ui16)((uvlc_entry >> 5) & 0xFF);
1791 sp[3] = (
ui16)((uvlc_entry >> 13) & 0xFF);
1792 dconsume(vlc_val, vlc_bits, total_bits);
1794 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
1795 uvlc_entry =
uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
1796 dconsume(vlc_val, vlc_bits, uvlc_entry & 0x7u);
1798 ui32 len = uvlc_entry & 0xF;
1799 ui32 tmp = (
ui32)vlc_val & ((1u << len) - 1);
1800 dconsume(vlc_val, vlc_bits, len);
1802 len = uvlc_entry & 0x7;
1804 sp[1] = (
ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
1805 sp[3] = (
ui16)((uvlc_entry >> 3) + (tmp >> len));
1813 ui32 missing_msbs,
ui32 num_passes,
1818 static bool insufficient_precision =
false;
1819 static bool modify_code =
false;
1820 static bool truncate_spp_mrp =
false;
1822 if (num_passes > 1 && lengths2 == 0)
1824 OJPH_WARN(0x00010001,
"A malformed codeblock that has more than "
1825 "one coding pass, but zero length for "
1826 "2nd and potential 3rd pass.");
1832 OJPH_WARN(0x00010002,
"We do not support more than 3 coding passes; "
1833 "This codeblocks has %d passes.",
1838 if (missing_msbs > 30)
1840 if (insufficient_precision ==
false)
1842 insufficient_precision =
true;
1843 OJPH_WARN(0x00010003,
"32 bits are not enough to decode this "
1844 "codeblock. This message will not be "
1845 "displayed again.");
1849 else if (missing_msbs == 30)
1851 if (modify_code ==
false) {
1853 OJPH_WARN(0x00010004,
"Not enough precision to decode the cleanup "
1854 "pass. The code can be modified to support "
1855 "this case. This message will not be "
1856 "displayed again.");
1860 else if (missing_msbs == 29)
1862 if (num_passes > 1) {
1864 if (truncate_spp_mrp ==
false) {
1865 truncate_spp_mrp =
true;
1866 OJPH_WARN(0x00010005,
"Not enough precision to decode the SgnProp "
1867 "nor MagRef passes; both will be skipped. "
1868 "This message will not be displayed "
1873 ui32 p = 30 - missing_msbs;
1879 OJPH_WARN(0x00010006,
"Wrong codeblock length.");
1885 lcup = (int)lengths1;
1887 scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF);
1888 if (scup < 2 || scup > lcup || scup > 4079)
1912 ui32 sstr = ((width + 2u) + 7u) & ~7u;
1915 ui16 scratch[8 * 513] = {0};
1917 ui16 scratch[8 * 513];
1918 ui32 quad_rows = (height + 1u) >> 1;
1919 size_t scratch_zero = (size_t)(quad_rows + 1) * sstr;
1920 if (scratch_zero > 8 * 513) scratch_zero = 8 * 513;
1921 memset(scratch, 0, scratch_zero *
sizeof(
ui16));
1924 assert((stride & 0x3) == 0);
1926 ui32 mmsbp2 = missing_msbs + 2;
1935 decode_cb_step1_vlc(scratch, coded_data, lcup, scup, width, height, sstr);
1947 if (!decode_cb_step2_32bit(scratch, decoded_data, coded_data,
1948 width, height, stride, sstr, p, mmsbp2,
1953 if (!decode_cb_step2_16bit(scratch, decoded_data, coded_data,
1954 width, height, stride, sstr, p, mmsbp2,
1960 decode_cb_spp_mrp(scratch, decoded_data, coded_data, width, height,
1961 stride, sstr, p, num_passes, lengths1, lengths2,
ui32 uvlc_tbl1_wide[4096]
uvlc_tbl1_wide: wider UVLC table for non-initial rows. Index = mode(2 bits) * 1024 + vlc_data(10 bits...
ui16 uvlc_tbl0[256+64]
uvlc_tbl0 contains decoding information for initial row of quads
ui16 uvlc_tbl1[256]
uvlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void mel_read(dec_mel_st *melp)
Reads and unstuffs the MEL bitstream.
static int mel_get_run(dec_mel_st *melp)
Retrieves one run from dec_mel_st; if there are no runs stored MEL segment is decoded.
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
bool ojph_decode_codeblock_avx2(ui8 *coded_data, ui32 *decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal)
static void mel_decode(dec_mel_st *melp)
Decodes unstuffed MEL segment bits stored in tmp to runs.
#define OJPH_FORCE_INLINE
MEL state structure for reading and decoding the MEL bitstream.
bool unstuff
true if the next bit needs to be unstuffed
int num_runs
number of decoded runs left in runs (maximum 8)
int size
number of bytes in MEL code
ui8 * data
the address of data (or bitstream)
int k
state of MEL decoder
int bits
number of bits stored in tmp
ui64 tmp
temporary buffer for read data
ui64 runs
runs of decoded MEL codewords (7 bits/run)