39#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
// Branch-prediction hint macros: the MSVC variant expands to the bare
// expression, the other variant wraps the expression in __builtin_expect.
// NOTE(review): the MSVC definition of likely() and the #else/#endif lines
// are not visible in this listing - confirm against the full file.
53#ifdef OJPH_COMPILER_MSVC
55 #define unlikely(x) (x)
57 #define likely(x) __builtin_expect((x), 1)
58 #define unlikely(x) __builtin_expect((x), 0)
// Four parallel 33-entry lookup tables: a prefix codeword and its bit length,
// and a suffix codeword and its bit length. They are filled by the table
// initialisation code further down and read by the proc_vlc_encode*
// functions, which index them with u_q values.
76 static ui32 ulvc_cwd_pre[33];
77 static int ulvc_cwd_pre_len[33];
78 static ui32 ulvc_cwd_suf[33];
79 static int ulvc_cwd_suf_len[33];
// One row of a VLC source table. The initialisation loops that follow select
// a row by matching c_q, rho and u_off, test (emb & e_k) == e_1 for the
// u_off == 1 rows, and pack cwd and cwd_len into the target table entry.
84 struct vlc_src_table {
int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
85 vlc_src_table tbl0[] = {
88 size_t tbl0_size =
sizeof(tbl0) /
sizeof(vlc_src_table);
90 si32 pattern_popcnt[16];
91 for (
ui32 i = 0; i < 16; ++i)
94 vlc_src_table* src_tbl = tbl0;
96 size_t tbl_size = tbl0_size;
97 for (
int i = 0; i < 2048; ++i)
99 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
100 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
104 vlc_src_table *best_entry = NULL;
108 for (
size_t j = 0; j < tbl_size; ++j)
110 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
111 if (src_tbl[j].u_off == 1)
112 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
116 int ones_count = pattern_popcnt[src_tbl[j].e_k];
117 if (ones_count >= best_e_k)
119 best_entry = src_tbl + j;
120 best_e_k = ones_count;
127 for (
size_t j = 0; j < tbl_size; ++j)
129 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
130 if (src_tbl[j].u_off == 0)
132 best_entry = src_tbl + j;
138 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
143 vlc_src_table tbl1[] = {
146 size_t tbl1_size =
sizeof(tbl1) /
sizeof(vlc_src_table);
150 tbl_size = tbl1_size;
151 for (
int i = 0; i < 2048; ++i)
153 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
154 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
158 vlc_src_table *best_entry = NULL;
162 for (
size_t j = 0; j < tbl_size; ++j)
164 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
165 if (src_tbl[j].u_off == 1)
166 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
170 int ones_count = pattern_popcnt[src_tbl[j].e_k];
171 if (ones_count >= best_e_k)
173 best_entry = src_tbl + j;
174 best_e_k = ones_count;
181 for (
size_t j = 0; j < tbl_size; ++j)
183 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
184 if (src_tbl[j].u_off == 0)
186 best_entry = src_tbl + j;
192 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
205 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
206 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
207 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
208 ulvc_cwd_pre_len[2] = 2;
209 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
210 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
211 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
212 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
213 ulvc_cwd_suf_len[2] = 0;
214 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
215 for (
int i = 5; i < 33; ++i)
218 ulvc_cwd_pre_len[i] = 3;
219 ulvc_cwd_suf[i] = (
ui32)(i-5);
220 ulvc_cwd_suf_len[i] = 5;
227 static bool tables_initialized =
false;
228 static std::once_flag tables_initialized_flag;
229 std::call_once(tables_initialized_flag, []() {
235 return tables_initialized;
261 melp->buf_size = buffer_size;
262 melp->remaining_bits = 8;
273 melp->tmp = (melp->tmp << 1) + v;
274 melp->remaining_bits--;
275 if (melp->remaining_bits == 0) {
276 melp->buf[melp->pos++] = (
ui8)melp->tmp;
277 melp->remaining_bits = (melp->tmp == 0xFF ? 7 : 8);
287 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
291 if (melp->run >= melp->threshold) {
294 melp->k =
ojph_min(12, melp->k + 1);
295 melp->threshold = 1 << mel_exp[melp->k];
299 int t = mel_exp[melp->k];
305 melp->threshold = 1 << mel_exp[melp->k];
312 struct vlc_struct_avx2 {
320 bool last_greater_than_8F;
327 vlcp->buf = data + buffer_size - 1;
329 vlcp->buf_size = buffer_size;
334 vlcp->last_greater_than_8F =
true;
341 vlcp->tmp |= (
ui64)cwd << vlcp->used_bits;
342 vlcp->used_bits += cwd_len;
344 while (vlcp->used_bits >= 8) {
347 if (unlikely(vlcp->last_greater_than_8F)) {
348 tmp = vlcp->tmp & 0x7F;
350 if (likely(tmp != 0x7F)) {
351 tmp = vlcp->tmp & 0xFF;
352 *(vlcp->buf - vlcp->pos) = tmp;
353 vlcp->last_greater_than_8F = tmp > 0x8F;
355 vlcp->used_bits -= 8;
357 *(vlcp->buf - vlcp->pos) = tmp;
358 vlcp->last_greater_than_8F =
false;
360 vlcp->used_bits -= 7;
364 tmp = vlcp->tmp & 0xFF;
365 *(vlcp->buf - vlcp->pos) = tmp;
366 vlcp->last_greater_than_8F = tmp > 0x8F;
368 vlcp->used_bits -= 8;
384 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
385 *(vlcp->buf - vlcp->pos) = 0x7f;
388 vlcp->used_bits -= 7;
391 melp->tmp = melp->tmp << melp->remaining_bits;
392 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
393 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
394 if ((mel_mask | vlc_mask) == 0)
397 if (melp->pos >= melp->buf_size)
398 OJPH_ERROR(0x00020003,
"mel encoder's buffer is full");
399 ui8 vlcp_tmp = (
ui8)vlcp->tmp;
400 int fuse = melp->tmp | vlcp_tmp;
401 if ( ( ((fuse ^ melp->tmp) & mel_mask)
402 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
403 && (fuse != 0xFF) && vlcp->pos > 1)
405 melp->buf[melp->pos++] = (
ui8)fuse;
409 if (vlcp->pos >= vlcp->buf_size)
410 OJPH_ERROR(0x00020004,
"vlc encoder's buffer is full");
411 melp->buf[melp->pos++] = (
ui8)melp->tmp;
412 *(vlcp->buf - vlcp->pos) = (
ui8)vlcp_tmp;
437 msp->buf_size = buffer_size;
449 if (msp->pos >= msp->buf_size)
450 OJPH_ERROR(0x00020005,
"magnitude sign encoder's buffer is full");
451 int t =
ojph_min(msp->max_bits - msp->used_bits, cwd_len);
452 msp->tmp |= ((
ui32)(cwd & ((1U << t) - 1))) << msp->used_bits;
456 if (msp->used_bits >= msp->max_bits)
458 msp->buf[msp->pos++] = (
ui8)msp->tmp;
459 msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8;
472 int t = msp->max_bits - msp->used_bits;
473 msp->tmp |= (0xFF & ((1U << t) - 1)) << msp->used_bits;
475 if (msp->tmp != 0xFF)
477 if (msp->pos >= msp->buf_size)
478 OJPH_ERROR(0x00020006,
"magnitude sign encoder's buffer is full");
479 msp->buf[msp->pos++] = (
ui8)msp->tmp;
482 else if (msp->max_bits == 7)
// Shorthand vector constants: ZERO is an all-zero 256-bit vector, ONE has
// every 32-bit lane set to 1. Each use expands to a fresh intrinsic call.
486#define ZERO _mm256_setzero_si256()
487#define ONE _mm256_set1_epi32(1)
// Per-lane leading-zero count of eight 32-bit values, emulated through the
// exponent of an int-to-float conversion.
// NOTE(review): the return statement and closing brace are not visible in
// this listing.
490inline __m256i avx2_lzcnt_epi32(__m256i v) {
// Clear every bit that has a set bit 8 positions above it; the highest set
// bit is kept. Presumably this stops the float conversion from rounding up
// into the next exponent - confirm.
492 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
// Convert to float and reinterpret the bits as integers again.
494 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
// Drop the 23 mantissa bits, leaving sign and biased exponent.
495 v = _mm256_srli_epi32(v, 23);
// 158 = 127 (exponent bias) + 31. The subtraction is unsigned-saturating on
// 16-bit elements, so anything larger than 158 yields 0.
496 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
// Clamp to 32; a zero input has exponent field 0 and would otherwise give 158.
497 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
// Per-lane 32-bit "not equal": computes the equality mask and inverts it by
// XOR with all ones, so a lane is all ones where v != v2 and zero otherwise.
502inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
503 return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff));
// Derives per-sample quantities from four input vectors (src_vec[0..3]) and
// writes them to eq_vec[0..3], s_vec[0..3], rho_vec and e_qmax_vec.
//   p - right-shift amount applied after the input has been doubled.
// NOTE(review): the declarations of val_vec, _eq_vec, _s_vec, _rho_vec, tmp1
// and tmp2, and the closing braces, are not visible in this listing.
506static void proc_pixel(__m256i *src_vec,
ui32 p,
507 __m256i *eq_vec, __m256i *s_vec,
508 __m256i &rho_vec, __m256i &e_qmax_vec)
515 for (
ui32 i = 0; i < 4; ++i) {
// src + src is a left shift by one, which discards the top bit of each lane.
517 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
520 val_vec[i] = _mm256_srli_epi32(val_vec[i], (
int)p);
// Clear the least significant bit.
523 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((
int)~1u));
// All-ones in lanes whose value is non-zero at this point.
526 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
// _eq = 32 - lzcnt(val - 1), i.e. the bit width of (val - 1).
533 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
534 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
535 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
// _s = (val - 2) + top bit of the original input lane.
542 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
543 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
544 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
// Force both results to zero for lanes that were zero before the decrements.
546 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
547 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
// Reuse val_vec as a 0/1 flag: 1 where the lane was non-zero.
548 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
// Permutation moving even-indexed lanes to the low 128 bits and odd-indexed
// lanes to the high 128 bits.
552 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// For each pair (i, i + 2): output i gets the low halves of both permuted
// vectors (selector 0x20), output i + 2 gets the high halves (selector 0x31).
566 for (
ui32 i = 0; i < 2; ++i) {
567 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
568 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
569 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
570 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
572 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
573 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
574 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
575 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
577 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
578 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
579 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
580 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
// e_qmax = lane-wise maximum over the four eq vectors.
583 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
584 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
585 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
// rho packs the four 0/1 flags into bits 0..3 of each lane.
586 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
587 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
588 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
589 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
590 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
591 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
// Rearranges the four vectors matrix[0..3] in place: 32-bit unpacks, then
// 64-bit unpacks, then 128-bit lane permutes across vector pairs.
// NOTE(review): the statements that consume tmp1 after each permute, and the
// braces, are not visible in this listing, so the final layout cannot be
// stated from here.
607static void rotate_matrix(__m256i *matrix)
// Interleave 32-bit elements of rows 0/1 and rows 2/3 (low and high parts of
// each 128-bit half).
609 __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
610 __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
611 __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
612 __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);
// Interleave the 64-bit pairs produced above.
614 matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2);
615 matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4);
616 matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2);
617 matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4);
// 0x20 joins the two low 128-bit halves, 0x31 joins the two high halves.
619 tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20);
620 matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31);
623 tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20);
624 matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31);
// Builds per-sample bit counts (m_vec) and codewords from tuple_vec, uq_vec,
// rho_vec and s_vec, then combines adjacent codewords in pairs.
// NOTE(review): the remaining parameters, the local declarations (m_vec,
// cwd, cwd_len, idx, _cwd, _cwd_len), the code between the two
// rotate_matrix calls and the call that consumes the combined codeword are
// not visible in this listing.
628static void proc_ms_encode(
ms_struct *msp,
// For k = 0..3: m_vec[k] = uq - (bit k of tuple) in lanes where bit k of rho
// is set, and 0 elsewhere.
638 auto tmp = _mm256_and_si256(tuple_vec, ONE);
639 tmp = _mm256_sub_epi32(uq_vec, tmp);
640 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
641 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
642 m_vec[0] = _mm256_and_si256(mask, tmp);
645 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
646 tmp = _mm256_srli_epi32(tmp, 1);
647 tmp = _mm256_sub_epi32(uq_vec, tmp);
648 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
649 mask = avx2_cmpneq_epi32(tmp1, ZERO);
650 m_vec[1] = _mm256_and_si256(mask, tmp);
653 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
654 tmp = _mm256_srli_epi32(tmp, 2);
655 tmp = _mm256_sub_epi32(uq_vec, tmp);
656 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
657 mask = avx2_cmpneq_epi32(tmp1, ZERO);
658 m_vec[2] = _mm256_and_si256(mask, tmp);
661 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
662 tmp = _mm256_srli_epi32(tmp, 3);
663 tmp = _mm256_sub_epi32(uq_vec, tmp);
664 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
665 mask = avx2_cmpneq_epi32(tmp1, ZERO);
666 m_vec[3] = _mm256_and_si256(mask, tmp);
// Both matrices get the same rearrangement so lengths and values stay paired.
668 rotate_matrix(m_vec);
680 rotate_matrix(s_vec);
688 for (
ui32 i = 0; i < 4; ++i) {
// Spill the eight bit counts, then form codeword = s & ((1 << m) - 1) per
// lane and spill those too.
692 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
693 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
694 tmp = _mm256_sub_epi32(tmp, ONE);
695 tmp = _mm256_and_si256(tmp, s_vec[i]);
696 _mm256_storeu_si256((__m256i*)cwd, tmp);
698 for (
ui32 j = 0; j < 4; ++j) {
// Append entry idx + 1 above the bits of entry idx; the cast to ui64 keeps
// the shifted value from being truncated to 32 bits.
701 _cwd_len = cwd_len[idx];
702 _cwd |= ((
ui64)cwd[idx + 1]) << _cwd_len;
703 _cwd_len += cwd_len[idx + 1];
// Returns a per-lane 4-bit pattern in which bit k is set when eq_vec[k]
// equals e_qmax_vec; lanes where u_q_vec is not greater than zero return 0.
// NOTE(review): the rest of the parameter list is not visible in this
// listing.
709static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
// All-ones in lanes with u_q > 0; applied to the result at the end.
719 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
// Each comparison mask is reduced to 0/1 by a logical shift right of 31 and
// then moved to bit position k.
721 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
722 auto eps_vec = _mm256_srli_epi32(mask, 31);
724 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
725 auto tmp = _mm256_srli_epi32(mask, 31);
726 tmp = _mm256_slli_epi32(tmp, 1);
727 eps_vec = _mm256_or_si256(eps_vec, tmp);
729 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
730 tmp = _mm256_srli_epi32(mask, 31);
731 tmp = _mm256_slli_epi32(tmp, 2);
732 eps_vec = _mm256_or_si256(eps_vec, tmp);
734 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
735 tmp = _mm256_srli_epi32(mask, 31);
736 tmp = _mm256_slli_epi32(tmp, 3);
737 eps_vec = _mm256_or_si256(eps_vec, tmp);
739 return _mm256_and_si256(u_q_mask, eps_vec);
// Stores into e_val_vec[x] the lane-wise maximum of eq_vec[1] and a copy of
// eq_vec[3] permuted by left_shift, and carries the last lane of eq_vec[3]
// over to the next call through prev_e_val_vec.
742static void update_lep(
ui32 x, __m256i &prev_e_val_vec,
743 __m256i *eq_vec, __m256i *e_val_vec,
744 const __m256i left_shift)
750 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
// Lane 0 is replaced by lane 0 of the value carried from the previous call.
751 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
// New carry: lane 7 of eq_vec[3] placed in lane 0, all other lanes zero.
752 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
753 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
// Stores into cx_val_vec[x] a per-lane 0/1 flag that is the OR of bit 3 of
// rho_vec permuted by left_shift and bit 1 of rho_vec itself, and carries
// the last lane of rho_vec over to the next call through prev_cx_val_vec.
757static void update_lcxp(
ui32 x, __m256i &prev_cx_val_vec,
758 __m256i &rho_vec, __m256i *cx_val_vec,
759 const __m256i left_shift)
765 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
// Lane 0 is replaced by lane 0 of the value carried from the previous call.
766 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
// New carry: lane 7 of rho_vec placed in lane 0, all other lanes zero.
767 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
// Isolate bit 3 of the permuted value as 0/1.
769 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
770 tmp = _mm256_srli_epi32(tmp, 3);
// Isolate bit 1 of the current value as 0/1.
772 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
773 tmp1 = _mm256_srli_epi32(tmp1, 1);
774 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
// Looks up eight table entries at once. The per-lane index is
// (cq << 8) + (rho << 4) + eps, and the entries are gathered from vlc_tbl as
// 32-bit values (gather scale 4 bytes).
777static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
778 __m256i &eps_vec,
ui32 *vlc_tbl)
781 auto tmp = _mm256_slli_epi32(cq_vec, 8);
782 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
783 tmp = _mm256_add_epi32(tmp, tmp1);
784 tmp = _mm256_add_epi32(tmp, eps_vec);
785 return _mm256_i32gather_epi32((
const int *)vlc_tbl, tmp, 4);
// Returns (rho >> 1) | (rho & 1) per lane. The signature matches fn_proc_cq;
// x, cx_val_vec and right_shift are not used by any line visible here.
788static __m256i proc_cq1(
ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
789 const __m256i right_shift)
796 auto tmp = _mm256_srli_epi32(rho_vec, 1);
797 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
798 return _mm256_or_si256(tmp, tmp1);
// Combines cx_val_vec[x], the first lanes of cx_val_vec[x + 1] and two bits
// of rho_vec into one per-lane value. The signature matches fn_proc_cq.
// NOTE(review): the #else/#endif of the architecture switch and the braces
// are not visible in this listing.
801static __m256i proc_cq2(
ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
802 const __m256i right_shift)
// lcxp1_vec is cx_val_vec[x] permuted once by right_shift; tmp is permuted
// twice.
806 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
807 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
// Both branches overwrite lanes 6 and 7 of tmp with lanes 0 and 1 of
// cx_val_vec[x + 1]: one 64-bit insert on x86-64, two 32-bit inserts on i386.
809#ifdef OJPH_ARCH_X86_64
810 tmp = _mm256_insert_epi64(tmp,
811 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
812#elif (defined OJPH_ARCH_I386)
813 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
814 tmp = _mm256_insert_epi32(tmp, lsb, 6);
815 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
816 tmp = _mm256_insert_epi32(tmp, msb, 7);
// NOTE(review): the message reads "unsupport compiler", but the condition
// tested is the target architecture - consider rewording.
818 #error Error unsupport compiler
// The twice-permuted value is multiplied by 4 and added to the once-permuted
// value, whose lane 7 comes from lane 0 of cx_val_vec[x + 1].
820 tmp = _mm256_slli_epi32(tmp, 2);
821 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
822 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
823 tmp = _mm256_add_epi32(tmp1, tmp);
// Bit 2 of rho is moved to bit 1 and ORed in.
825 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
826 tmp1 = _mm256_srli_epi32(tmp1, 1);
827 tmp = _mm256_or_si256(tmp, tmp1);
// Bit 3 of rho is moved to bit 1 and ORed in as well.
829 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
830 tmp1 = _mm256_srli_epi32(tmp1, 2);
832 return _mm256_or_si256(tmp, tmp1);
// Function-pointer type shared by proc_cq1 and proc_cq2.
835using fn_proc_cq = __m256i (*)(
ui32, __m256i *, __m256i &,
const __m256i);
// Spills per-lane flags derived from cq_vec, rho_vec and u_q_vec to scalar
// arrays and then walks them two lanes at a time. The signature matches
// fn_proc_mel_encode.
//   ignore - reduces the number of lanes visited: i_max = 8 - ignore / 2.
// NOTE(review): the declarations of mel_bit and mel_bit2 and the bodies of
// the three if statements are not visible in this listing.
837static void proc_mel_encode1(
mel_struct *melp, __m256i &cq_vec,
838 __m256i &rho_vec, __m256i u_q_vec,
ui32 ignore,
839 const __m256i right_shift)
841 int32_t mel_need_encode[8];
842 int32_t mel_need_encode2[8];
// Non-zero in lanes where cq == 0.
847 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
// 1 in lanes where rho != 0, otherwise 0.
849 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
// tmp holds u_q permuted by right_shift, so each lane can be compared with
// another lane of the same vector.
853 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
854 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
// 1 where min(u_q, permuted u_q) > 2, i.e. both values exceed 2.
855 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
// Non-zero where both u_q and the permuted u_q are greater than zero.
858 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
859 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
861 ui32 i_max = 8 - (ignore / 2);
863 for (
ui32 i = 0; i < i_max; i += 2) {
864 if (mel_need_encode[i]) {
869 if (mel_need_encode[i + 1]) {
874 if (mel_need_encode2[i]) {
// Spills per-lane flags derived from cq_vec and rho_vec and visits the lanes
// one at a time. The signature matches fn_proc_mel_encode; u_q_vec and
// right_shift are not used by any line visible here.
//   ignore - reduces the number of lanes visited: i_max = 8 - ignore / 2.
// NOTE(review): the declaration of mel_bit and the body of the if statement
// are not visible in this listing.
880static void proc_mel_encode2(
mel_struct *melp, __m256i &cq_vec,
881 __m256i &rho_vec, __m256i u_q_vec,
ui32 ignore,
882 const __m256i right_shift)
886 int32_t mel_need_encode[8];
// Non-zero in lanes where cq == 0.
891 _mm256_storeu_si256((__m256i*)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
// 1 in lanes where rho != 0, otherwise 0.
893 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
896 ui32 i_max = 8 - (ignore / 2);
898 for (
ui32 i = 0; i < i_max; ++i) {
899 if (mel_need_encode[i]) {
// Function-pointer type shared by proc_mel_encode1 and proc_mel_encode2.
905using fn_proc_mel_encode = void (*)(
mel_struct *, __m256i &, __m256i &,
906 __m256i,
ui32,
const __m256i);
// Assembles one combined codeword (val, size) per pair of entries in tuple
// and u_q. The signature matches fn_proc_vlc_encode.
// Each tuple entry contributes (entry >> 4) as bits and (entry & 7) as bit
// count; the ulvc_cwd_* tables contribute prefix and suffix bits selected by
// u_q.
// NOTE(review): the remaining parameters, the final else keyword, the call
// that consumes val/size and the closing braces are not visible in this
// listing.
908static void proc_vlc_encode1(vlc_struct_avx2 *vlcp,
ui32 *tuple,
// ignore reduces the number of entries visited: i_max = 8 - ignore / 2.
911 ui32 i_max = 8 - (ignore / 2);
913 for (
ui32 i = 0; i < i_max; i += 2) {
// Start with the first entry's bits, then append the second entry above them.
915 ui32 val = tuple[i + 0] >> 4;
916 int size = tuple[i + 0] & 7;
920 val |= (tuple[i + 1] >> 4) << size;
921 size += tuple[i + 1] & 7;
// Both u_q values exceed 2: the tables are indexed with u_q - 2, in the
// order prefix, prefix, suffix, suffix.
924 if (u_q[i] > 2 && u_q[i + 1] > 2) {
926 val |= (ulvc_cwd_pre[u_q[i] - 2]) << size;
927 size += ulvc_cwd_pre_len[u_q[i] - 2];
930 val |= (ulvc_cwd_pre[u_q[i + 1] - 2]) << size;
931 size += ulvc_cwd_pre_len[u_q[i + 1] - 2];
934 val |= (ulvc_cwd_suf[u_q[i] - 2]) << size;
935 size += ulvc_cwd_suf_len[u_q[i] - 2];
938 val |= (ulvc_cwd_suf[u_q[i + 1] - 2]) << size;
939 size += ulvc_cwd_suf_len[u_q[i + 1] - 2];
941 }
else if (u_q[i] > 2 && u_q[i + 1] > 0) {
// Only the first value exceeds 2: its prefix, then (u_q[i + 1] - 1) written
// directly, then the first value's suffix.
943 val |= (ulvc_cwd_pre[u_q[i]]) << size;
944 size += ulvc_cwd_pre_len[u_q[i]];
947 val |= (u_q[i + 1] - 1) << size;
951 val |= (ulvc_cwd_suf[u_q[i]]) << size;
952 size += ulvc_cwd_suf_len[u_q[i]];
// Remaining case (presumably the else branch): tables indexed with u_q
// directly, in the order prefix, prefix, suffix, suffix.
956 val |= (ulvc_cwd_pre[u_q[i]]) << size;
957 size += ulvc_cwd_pre_len[u_q[i]];
960 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
961 size += ulvc_cwd_pre_len[u_q[i + 1]];
964 val |= (ulvc_cwd_suf[u_q[i]]) << size;
965 size += ulvc_cwd_suf_len[u_q[i]];
968 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
969 size += ulvc_cwd_suf_len[u_q[i + 1]];
// Assembles one combined codeword (val, size) per pair of entries in tuple
// and u_q, always using the ulvc_cwd_* tables indexed with u_q directly.
// The signature matches fn_proc_vlc_encode.
// NOTE(review): the remaining parameters, the call that consumes val/size
// and the closing braces are not visible in this listing.
976static void proc_vlc_encode2(vlc_struct_avx2 *vlcp,
ui32 *tuple,
// ignore reduces the number of entries visited: i_max = 8 - ignore / 2.
979 ui32 i_max = 8 - (ignore / 2);
981 for (
ui32 i = 0; i < i_max; i += 2) {
// Each tuple entry contributes (entry >> 4) as bits and (entry & 7) as bit
// count; the second entry is appended above the first.
983 ui32 val = tuple[i + 0] >> 4;
984 int size = tuple[i + 0] & 7;
988 val |= (tuple[i + 1] >> 4) << size;
989 size += tuple[i + 1] & 7;
// Append order: prefix, prefix, suffix, suffix.
993 val |= ulvc_cwd_pre[u_q[i]] << size;
994 size += ulvc_cwd_pre_len[u_q[i]];
997 val |= (ulvc_cwd_pre[u_q[i + 1]]) << size;
998 size += ulvc_cwd_pre_len[u_q[i + 1]];
1001 val |= (ulvc_cwd_suf[u_q[i + 0]]) << size;
1002 size += ulvc_cwd_suf_len[u_q[i + 0]];
1005 val |= (ulvc_cwd_suf[u_q[i + 1]]) << size;
1006 size += ulvc_cwd_suf_len[u_q[i + 1]];
// Function-pointer type shared by proc_vlc_encode1 and proc_vlc_encode2.
1012using fn_proc_vlc_encode = void (*)(vlc_struct_avx2 *,
ui32 *,
ui32 *,
ui32);
1017 ojph::mem_elastic_allocator *elastic,
1018 ojph::coded_lists *& coded)
1022 ui32 width = (_width + 15) & ~15u;
1023 ui32 ignore = width - _width;
1024 const int ms_size = (16384 * 16 + 14) / 15;
1025 const int mel_vlc_size = 3072;
1026 const int mel_size = 192;
1027 const int vlc_size = mel_vlc_size - mel_size;
1029 ui8 ms_buf[ms_size];
1030 ui8 mel_vlc_buf[mel_vlc_size];
1031 ui8 *mel_buf = mel_vlc_buf;
1032 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1036 vlc_struct_avx2 vlc;
1039 ms_init(&ms, ms_size, ms_buf);
1041 const ui32 p = 30 - missing_msbs;
1052 const __m256i right_shift = _mm256_set_epi32(
1053 0, 7, 6, 5, 4, 3, 2, 1
1056 const __m256i left_shift = _mm256_set_epi32(
1057 6, 5, 4, 3, 2, 1, 0, 7
1060 ui32 n_loop = (width + 15) / 16;
1062 __m256i e_val_vec[65];
1064 e_val_vec[i] = ZERO;
1066 __m256i prev_e_val_vec = ZERO;
1068 __m256i cx_val_vec[65];
1069 __m256i prev_cx_val_vec = ZERO;
1078 fn_proc_cq proc_cq = proc_cq1;
1079 fn_proc_mel_encode proc_mel_encode = proc_mel_encode1;
1080 fn_proc_vlc_encode proc_vlc_encode = proc_vlc_encode1;
1083 for (
ui32 y = 0; y < height; y += 2)
1085 e_val_vec[n_loop] = prev_e_val_vec;
1087 __m256i tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1088 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1090 prev_e_val_vec = ZERO;
1091 prev_cx_val_vec = ZERO;
1093 ui32 *sp = buf + y * stride;
1096 for (
ui32 x = 0; x < n_loop; ++x) {
1099 if ((x == (n_loop - 1)) && (_width % 16)) {
1100 ui32 tmp_buf[16] = { 0 };
1101 memcpy(tmp_buf, sp, (_width % 16) *
sizeof(
ui32));
1102 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1103 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1104 if (y + 1 < height) {
1105 memcpy(tmp_buf, sp + stride, (_width % 16) *
sizeof(
ui32));
1106 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1107 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1115 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1116 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1118 if (y + 1 < height) {
1119 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1120 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1135 __m256i rho_vec, e_qmax_vec;
1136 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1139 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1140 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1142 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1143 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1146 tmp = _mm256_max_epi32(max_e_vec, ONE);
1147 __m256i tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1148 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1150 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1151 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1152 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1153 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1158 tmp = proc_cq(x, cx_val_vec, rho_vec, right_shift);
1160 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1161 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1162 prev_cq = (
ui32)_mm256_extract_epi32(tmp, 7);
1164 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1165 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1169 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1170 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1172 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1173 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
1174 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1176 proc_mel_encode(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1179 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1189 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1190 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1191 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1193 proc_vlc_encode(&vlc, tuple, u_q, _ignore);
1196 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1197 tmp = _mm256_slli_epi32(tmp, 2);
1198 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1199 prev_cq = (
ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1203 proc_mel_encode = proc_mel_encode2;
1204 proc_vlc_encode = proc_vlc_encode2;
1211 lengths[0] = mel.pos + vlc.pos + ms.pos;
1212 elastic->
get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1213 memcpy(coded->
buf, ms.buf, ms.pos);
1214 memcpy(coded->
buf + ms.pos, mel.buf, mel.pos);
1215 memcpy(coded->
buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1218 ui32 num_bytes = mel.pos + vlc.pos;
1219 coded->
buf[lengths[0]-1] = (
ui8)(num_bytes >> 4);
1220 coded->
buf[lengths[0]-2] = coded->
buf[lengths[0]-2] & 0xF0;
1221 coded->
buf[lengths[0]-2] =
1222 (
ui8)(coded->
buf[lengths[0]-2] | (num_bytes & 0xF));
void get_buffer(ui32 needed_bytes, coded_lists *&p)
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
static void mel_emit_bit(mel_struct *melp, int v)
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
static ui32 population_count(ui32 val)
#define OJPH_ERROR(t,...)