42#if defined(__apple_build_version__)
47#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
61#ifdef OJPH_COMPILER_MSVC
63 #define unlikely(x) (x)
65 #define likely(x) __builtin_expect((x), 1)
66 #define unlikely(x) __builtin_expect((x), 0)
84 static ui32 uvlc_tbl_pair1[33 * 33];
85 static ui32 uvlc_tbl_pair2[33 * 33];
86 static ui32 ulvc_cwd_pre[33];
87 static int ulvc_cwd_pre_len[33];
88 static ui32 ulvc_cwd_suf[33];
89 static int ulvc_cwd_suf_len[33];
// One row of a source VLC table: c_q and rho are the lookup keys, u_off
// selects which of the two scans below may pick the row, e_k / e_1 are a
// mask and the pattern it must produce, and cwd / cwd_len are the codeword
// and its length that get packed into the target table.
94 struct vlc_src_table {
int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
// First source table (initializer contents are not visible in this excerpt).
95 vlc_src_table tbl0[] = {
98 size_t tbl0_size =
sizeof(tbl0) /
sizeof(vlc_src_table);
// 16-entry helper indexed by a 4-bit pattern (e_k below); presumably the
// number of set bits of the index -- the filling loop body is not visible.
100 si32 pattern_popcnt[16];
101 for (
ui32 i = 0; i < 16; ++i)
104 vlc_src_table* src_tbl = tbl0;
106 size_t tbl_size = tbl0_size;
// Build 2048 target entries; the index packs c_q in bits 8 and up,
// rho in bits 4..7 and emb in bits 0..3.
107 for (
int i = 0; i < 2048; ++i)
109 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
// Special case: emb has a bit that rho lacks, or both rho and c_q are zero
// (the handling of this case is on lines not visible here).
110 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
114 vlc_src_table *best_entry = NULL;
// Scan for u_off == 1 rows matching c_q and rho whose masked emb equals e_1.
// Among the candidates the row with the largest pattern_popcnt[e_k] wins;
// because of ">=" a later row replaces an earlier one on a tie.
118 for (
size_t j = 0; j < tbl_size; ++j)
120 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
121 if (src_tbl[j].u_off == 1)
122 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
126 int ones_count = pattern_popcnt[src_tbl[j].e_k];
127 if (ones_count >= best_e_k)
129 best_entry = src_tbl + j;
130 best_e_k = ones_count;
// Alternative scan for u_off == 0 rows, keyed only on c_q and rho (the
// condition that chooses between the two scans is not visible here).
137 for (
size_t j = 0; j < tbl_size; ++j)
139 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
140 if (src_tbl[j].u_off == 0)
142 best_entry = src_tbl + j;
// Pack the chosen row: codeword from bit 8 up, codeword length from bit 4 up
// (the remaining terms of the sum are on a line not visible here).
148 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
// Second source table; the same construction is repeated with it below.
153 vlc_src_table tbl1[] = {
156 size_t tbl1_size =
sizeof(tbl1) /
sizeof(vlc_src_table);
160 tbl_size = tbl1_size;
// Same 2048-entry build as above, now with tbl_size set to tbl1_size
// (the reassignment of src_tbl / tgt_tbl is not visible in this excerpt).
161 for (
int i = 0; i < 2048; ++i)
163 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
164 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
168 vlc_src_table *best_entry = NULL;
// u_off == 1 scan: best match by largest popcount of e_k, later row wins ties.
172 for (
size_t j = 0; j < tbl_size; ++j)
174 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
175 if (src_tbl[j].u_off == 1)
176 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
180 int ones_count = pattern_popcnt[src_tbl[j].e_k];
181 if (ones_count >= best_e_k)
183 best_entry = src_tbl + j;
184 best_e_k = ones_count;
// u_off == 0 scan: match on c_q and rho only.
191 for (
size_t j = 0; j < tbl_size; ++j)
193 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
194 if (src_tbl[j].u_off == 0)
196 best_entry = src_tbl + j;
202 tgt_tbl[i] = (
ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
215 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
216 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
217 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
218 ulvc_cwd_pre_len[2] = 2;
219 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
220 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
221 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
222 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
223 ulvc_cwd_suf_len[2] = 0;
224 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
225 for (
int i = 5; i < 33; ++i)
228 ulvc_cwd_pre_len[i] = 3;
229 ulvc_cwd_suf[i] = (
ui32)(i-5);
230 ulvc_cwd_suf_len[i] = 5;
// Fills uvlc_tbl_pair1 and uvlc_tbl_pair2, two 33x33 tables indexed by
// uq0 * 33 + uq1. Every entry stores a combined codeword in the bits above
// bit 4 and its total bit length in the low 5 bits: (cwd << 5) | len.
// The codeword is assembled LSB-first from the ulvc_cwd_pre/suf tables.
236 static void uvlc_init_pair_tables()
238 for (
int uq0 = 0; uq0 < 33; ++uq0) {
239 for (
int uq1 = 0; uq1 < 33; ++uq1) {
// pair1, case A: both values exceed 2 -- each is coded with the table entry
// for (value - 2); order is prefix0, prefix1, suffix0, suffix1.
243 if (uq0 > 2 && uq1 > 2) {
244 cwd |= ulvc_cwd_pre[uq0 - 2];
245 len += ulvc_cwd_pre_len[uq0 - 2];
246 cwd |= ulvc_cwd_pre[uq1 - 2] << len;
247 len += ulvc_cwd_pre_len[uq1 - 2];
248 cwd |= ulvc_cwd_suf[uq0 - 2] << len;
249 len += ulvc_cwd_suf_len[uq0 - 2];
250 cwd |= ulvc_cwd_suf[uq1 - 2] << len;
251 len += ulvc_cwd_suf_len[uq1 - 2];
252 }
// pair1, case B: only uq0 exceeds 2 and uq1 is non-zero -- prefix of uq0,
// then (uq1 - 1) inserted directly, then suffix of uq0. The len update for
// the inserted value is on a line not visible in this excerpt.
else if (uq0 > 2 && uq1 > 0) {
253 cwd |= ulvc_cwd_pre[uq0];
254 len += ulvc_cwd_pre_len[uq0];
255 cwd |= (
ui32)(uq1 - 1) << len;
257 cwd |= ulvc_cwd_suf[uq0] << len;
258 len += ulvc_cwd_suf_len[uq0];
// pair1, remaining case (its opening line is not visible here): plain
// prefix0, prefix1, suffix0, suffix1 using the values unmodified.
260 cwd |= ulvc_cwd_pre[uq0];
261 len += ulvc_cwd_pre_len[uq0];
262 cwd |= ulvc_cwd_pre[uq1] << len;
263 len += ulvc_cwd_pre_len[uq1];
264 cwd |= ulvc_cwd_suf[uq0] << len;
265 len += ulvc_cwd_suf_len[uq0];
266 cwd |= ulvc_cwd_suf[uq1] << len;
267 len += ulvc_cwd_suf_len[uq1];
269 uvlc_tbl_pair1[uq0 * 33 + uq1] = (cwd << 5) | (
ui32)len;
// pair2: always the plain prefix0, prefix1, suffix0, suffix1 layout
// (cwd and len are presumably reset on lines not visible here).
272 cwd |= ulvc_cwd_pre[uq0];
273 len += ulvc_cwd_pre_len[uq0];
274 cwd |= ulvc_cwd_pre[uq1] << len;
275 len += ulvc_cwd_pre_len[uq1];
276 cwd |= ulvc_cwd_suf[uq0] << len;
277 len += ulvc_cwd_suf_len[uq0];
278 cwd |= ulvc_cwd_suf[uq1] << len;
279 len += ulvc_cwd_suf_len[uq1];
280 uvlc_tbl_pair2[uq0 * 33 + uq1] = (cwd << 5) | (
ui32)len;
287 static bool tables_initialized =
false;
288 static std::once_flag tables_initialized_flag;
289 std::call_once(tables_initialized_flag, []() {
294 uvlc_init_pair_tables();
296 return tables_initialized;
322 melp->buf_size = buffer_size;
323 melp->remaining_bits = 8;
330 static const int mel_exp[13] = {0,0,0,1,1,1,2,2,2,3,3,4,5};
336 melp->tmp = (melp->tmp << num_bits) | (
int)bits;
337 melp->remaining_bits -= num_bits;
338 if (melp->remaining_bits <= 0) {
339 int excess = -melp->remaining_bits;
340 ui8 byte = (
ui8)(melp->tmp >> excess);
341 melp->buf[melp->pos++] = byte;
342 melp->tmp &= (1 << excess) - 1;
343 melp->remaining_bits += 8 - (
byte == 0xFF);
353 if (melp->run >= melp->threshold) {
354 mel_emit_bits(melp, 1, 1);
356 melp->k =
ojph_min(12, melp->k + 1);
357 melp->threshold = 1 << mel_exp[melp->k];
360 int t = mel_exp[melp->k];
361 mel_emit_bits(melp, melp->run & ((1u << t) - 1), t + 1);
364 melp->threshold = 1 << mel_exp[melp->k];
373 while (remaining > 0) {
375 if (remaining >= space) {
377 mel_emit_bits(melp, 1, 1);
379 melp->k =
ojph_min(12, melp->k + 1);
380 melp->threshold = 1 << mel_exp[melp->k];
382 melp->run += (int)remaining;
392 int t = mel_exp[melp->k];
393 mel_emit_bits(melp, melp->run & ((1u << t) - 1), t + 1);
396 melp->threshold = 1 << mel_exp[melp->k];
418 vlcp->buf = data + buffer_size - 1;
420 vlcp->buf_size = buffer_size;
425 vlcp->last_greater_than_8F =
true;
432 while (vlcp->used_bits >= 8) {
433 int escape = (int)vlcp->last_greater_than_8F;
434 int is_7f = (int)((vlcp->tmp & 0x7F) == 0x7F);
435 int need_stuff = escape & is_7f;
436 int bits = 8 - need_stuff;
438 ui8 byte = (
ui8)(vlcp->tmp & ((1u << bits) - 1));
439 *(vlcp->buf - vlcp->pos) =
byte;
442 vlcp->used_bits -= bits;
443 vlcp->last_greater_than_8F =
byte > 0x8F;
452 int avail = 64 - vlcp->used_bits;
453 if (likely(cwd_len <= avail)) {
454 vlcp->tmp |= cwd << vlcp->used_bits;
455 vlcp->used_bits += cwd_len;
458 vlcp->tmp |= (cwd & ((1ULL << avail) - 1)) << vlcp->used_bits;
459 vlcp->used_bits = 64;
473 mel_emit_bits(melp, 1, 1);
475 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
476 *(vlcp->buf - vlcp->pos) = 0x7f;
479 vlcp->used_bits -= 7;
482 melp->tmp = melp->tmp << melp->remaining_bits;
483 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
484 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
485 if ((mel_mask | vlc_mask) == 0)
488 if (melp->pos >= melp->buf_size)
489 OJPH_ERROR(0x00020003,
"mel encoder's buffer is full");
490 ui8 vlcp_tmp = (
ui8)vlcp->tmp;
491 int fuse = melp->tmp | vlcp_tmp;
492 if ( ( ((fuse ^ melp->tmp) & mel_mask)
493 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
494 && (fuse != 0xFF) && vlcp->pos > 1)
496 melp->buf[melp->pos++] = (
ui8)fuse;
500 if (vlcp->pos >= vlcp->buf_size)
501 OJPH_ERROR(0x00020004,
"vlc encoder's buffer is full");
502 melp->buf[melp->pos++] = (
ui8)melp->tmp;
503 *(vlcp->buf - vlcp->pos) = (
ui8)vlcp_tmp;
529 msp->buf_size = buffer_size;
532 msp->last_was_ff =
false;
539 if (msp->last_was_ff) {
540 if (msp->used_bits < 7)
542 msp->buf[msp->pos++] = (
ui8)(msp->tmp & 0x7F);
545 msp->last_was_ff =
false;
548 while (msp->used_bits >= 8) {
549 int n_bytes = msp->used_bits >> 3;
550 if (n_bytes > 8) n_bytes = 8;
552 ui64 word = msp->tmp;
553 ui64 valid_mask = (n_bytes < 8)
554 ? (1ULL << (n_bytes * 8)) - 1 : ~(
ui64)0;
557 ui64 ff_detect = (w - 0x0101010101010101ULL) & ~w
558 & 0x8080808080808080ULL;
559 ff_detect &= valid_mask;
561 if (likely(ff_detect == 0)) {
562 memcpy(msp->buf + msp->pos, &word, (
size_t)n_bytes);
563 msp->pos += (
ui32)n_bytes;
565 msp->tmp >>= (n_bytes * 8);
568 msp->used_bits -= n_bytes * 8;
571 int safe = ff_pos + 1;
572 memcpy(msp->buf + msp->pos, &word, (
size_t)safe);
573 msp->pos += (
ui32)safe;
579 msp->used_bits -= bits;
581 if (msp->used_bits >= 7) {
582 msp->buf[msp->pos++] = (
ui8)(msp->tmp & 0x7F);
585 msp->last_was_ff =
false;
587 msp->last_was_ff =
true;
599 int avail = 64 - msp->used_bits;
600 if (likely(cwd_len <= avail)) {
601 msp->tmp |= cwd << msp->used_bits;
602 msp->used_bits += cwd_len;
605 msp->tmp |= (cwd & ((1ULL << avail) - 1)) << msp->used_bits;
617 int avail = 64 - msp->used_bits;
618 if (likely(cwd_len <= avail)) {
619 msp->tmp |= cwd << msp->used_bits;
620 msp->used_bits += cwd_len;
622 msp->tmp |= (cwd & ((1ULL << avail) - 1)) << msp->used_bits;
627 msp->tmp |= cwd << msp->used_bits;
628 msp->used_bits += cwd_len;
640 int max_bits = msp->last_was_ff ? 7 : 8;
641 int t = max_bits - msp->used_bits;
642 ui32 byte = (
ui32)(msp->tmp & ((1ULL << msp->used_bits) - 1));
643 byte |= (0xFFu & ((1u << t) - 1)) << msp->used_bits;
646 if (msp->pos >= msp->buf_size)
647 OJPH_ERROR(0x00020006,
"magnitude sign encoder's buffer is full");
648 msp->buf[msp->pos++] = (
ui8)
byte;
651 else if (msp->last_was_ff)
655#define ZERO _mm256_setzero_si256()
656#define ONE _mm256_set1_epi32(1)
// Per-lane leading-zero count for eight 32-bit integers, emulated through
// the exponent of an int-to-float conversion (callers compute 32 - result).
// The return statement is on a line not visible in this excerpt.
659inline __m256i avx2_lzcnt_epi32(__m256i v) {
// v & ~(v >> 8): clears low-order bits that sit 8 positions below a set bit,
// so the float conversion cannot round up into the next power of two.
661 v = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);
// Convert to float and reinterpret the bits as an integer again.
663 v = _mm256_castps_si256(_mm256_cvtepi32_ps(v));
// Drop the 23 mantissa bits, leaving the biased exponent.
664 v = _mm256_srli_epi32(v, 23);
// 158 - exponent, using a saturating unsigned 16-bit subtract.
665 v = _mm256_subs_epu16(_mm256_set1_epi32(158), v);
// Clamp to 32; an all-zero input (exponent 0 -> 158) therefore yields 32.
666 v = _mm256_min_epi16(v, _mm256_set1_epi32(32));
// Per-lane 32-bit "not equal": inverts the all-ones/all-zeros mask produced
// by _mm256_cmpeq_epi32, so a lane is all ones where v != v2 and zero where
// the two lanes are equal.
671inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
672 return _mm256_xor_si256(_mm256_cmpeq_epi32(v, v2), _mm256_set1_epi32((int32_t)0xffffffff));
// Processes four vectors of eight 32-bit samples (src_vec[0..3]) and derives,
// per output lane:
//   eq_vec[0..3]  - 32 minus the leading-zero count of (val - 1), 0 if val == 0
//   s_vec[0..3]   - (val - 2) plus the sample's top bit, 0 if val == 0
//   rho_vec       - 4-bit pattern, bit k set where the k-th value is non-zero
//   e_qmax_vec    - lane-wise maximum of eq_vec[0..3]
// where val is the sample doubled, shifted right by p, with bit 0 cleared.
// Declarations of the local arrays are on lines not visible in this excerpt.
675static void proc_pixel(__m256i *src_vec,
ui32 p,
676 __m256i *eq_vec, __m256i *s_vec,
677 __m256i &rho_vec, __m256i &e_qmax_vec)
684 for (
ui32 i = 0; i < 4; ++i) {
// Doubling shifts bit 31 of the sample out of the lane.
686 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
689 val_vec[i] = _mm256_srli_epi32(val_vec[i], (
int)p);
// Clear bit 0.
692 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((
int)~1u));
// All-ones in lanes whose value is non-zero.
695 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
// eq = 32 - lzcnt(val - 1).
702 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
703 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
704 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
// s = (val - 2) + bit 31 of the original sample.
711 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
712 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
713 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
// Zero the results in lanes whose value was zero, and turn val_vec into a
// 0/1 "non-zero" flag per lane.
715 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
716 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
717 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
// Permutation that gathers even source lanes (0,2,4,6) into the low 128 bits
// and odd source lanes (1,3,5,7) into the high 128 bits.
721 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
// De-interleave: outputs [0 + i] combine the even lanes of inputs [0 + i] and
// [2 + i] (selector 0x20 = low halves); outputs [2 + i] combine the odd lanes
// (selector 0x31 = high halves).
735 for (
ui32 i = 0; i < 2; ++i) {
736 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
737 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
738 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
739 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
741 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
742 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
743 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
744 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
746 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
747 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
748 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
749 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
// Lane-wise maximum over the four exponent vectors.
752 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
753 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
754 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
// Pack the four 0/1 flags into bits 0..3 of rho.
755 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
756 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
757 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
758 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
759 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
760 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
// Rearranges four 8-lane vectors in place so that values held at the same
// lane of the four inputs end up next to each other in one output vector.
// Two stores (to matrix[0] and matrix[1]) are on lines not visible here.
776static void rotate_matrix(__m256i *matrix)
// Interleave 32-bit elements of rows 0/1 and of rows 2/3, independently
// inside each 128-bit half.
778 __m256i tmp1 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
779 __m256i tmp2 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
780 __m256i tmp3 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
781 __m256i tmp4 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);
// Interleave 64-bit pairs. Within each 128-bit half matrix[0] now holds
// element 0 of every row, matrix[1] element 2, matrix[2] element 1 and
// matrix[3] element 3.
783 matrix[0] = _mm256_unpacklo_epi64(tmp1, tmp2);
784 matrix[1] = _mm256_unpacklo_epi64(tmp3, tmp4);
785 matrix[2] = _mm256_unpackhi_epi64(tmp1, tmp2);
786 matrix[3] = _mm256_unpackhi_epi64(tmp3, tmp4);
// Regroup 128-bit halves: 0x20 joins the two low halves, 0x31 the two high
// halves, first for the matrix[0]/matrix[2] pair ...
788 tmp1 = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x20);
789 matrix[2] = _mm256_permute2x128_si256(matrix[0], matrix[2], 0x31);
// ... then for the matrix[1]/matrix[3] pair.
792 tmp1 = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x20);
793 matrix[3] = _mm256_permute2x128_si256(matrix[1], matrix[3], 0x31);
// Computes, for each of the four positions k, a bit count
// m_vec[k] = uq_vec - (bit k of tuple_vec) in lanes where bit k of rho_vec is
// set (0 elsewhere), keeps the low m bits of the matching s_vec value, and
// feeds the resulting codewords to ms_encode_nodefer on msp.
// The remaining parameters and local declarations are on lines not visible
// in this excerpt.
797static void proc_ms_encode(
ms_struct *msp,
// Position 0: uses bit 0 of tuple_vec and bit 0 of rho_vec.
807 auto tmp = _mm256_and_si256(tuple_vec, ONE);
808 tmp = _mm256_sub_epi32(uq_vec, tmp);
809 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
810 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
811 m_vec[0] = _mm256_and_si256(mask, tmp);
// Position 1: bit 1.
814 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
815 tmp = _mm256_srli_epi32(tmp, 1);
816 tmp = _mm256_sub_epi32(uq_vec, tmp);
817 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
818 mask = avx2_cmpneq_epi32(tmp1, ZERO);
819 m_vec[1] = _mm256_and_si256(mask, tmp);
// Position 2: bit 2.
822 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
823 tmp = _mm256_srli_epi32(tmp, 2);
824 tmp = _mm256_sub_epi32(uq_vec, tmp);
825 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
826 mask = avx2_cmpneq_epi32(tmp1, ZERO);
827 m_vec[2] = _mm256_and_si256(mask, tmp);
// Position 3: bit 3.
830 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
831 tmp = _mm256_srli_epi32(tmp, 3);
832 tmp = _mm256_sub_epi32(uq_vec, tmp);
833 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
834 mask = avx2_cmpneq_epi32(tmp1, ZERO);
835 m_vec[3] = _mm256_and_si256(mask, tmp);
// Regroup both the bit counts and the values with the same lane shuffle so
// that they stay aligned with each other.
837 rotate_matrix(m_vec);
838 rotate_matrix(s_vec);
844 for (
ui32 i = 0; i < 4; ++i) {
// Spill the 8 bit counts, and the 8 values masked to their low m bits
// ((1 << m) - 1), to the scalar arrays cwd_len / cwd.
848 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
849 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
850 tmp = _mm256_sub_epi32(tmp, ONE);
851 tmp = _mm256_and_si256(tmp, s_vec[i]);
852 _mm256_storeu_si256((__m256i*)cwd, tmp);
// Two iterations; each handles two pairs of entries (idx0 pair, idx1 pair).
// The definition of idx0 is on a line not visible here.
854 for (
ui32 j = 0; j < 4; j += 2) {
// Concatenate the idx0 pair LSB-first into a 64-bit word.
856 ui64 _cwd = cwd[idx0];
857 int _cwd_len = cwd_len[idx0];
858 _cwd |= ((
ui64)cwd[idx0 + 1]) << _cwd_len;
859 _cwd_len += cwd_len[idx0 + 1];
861 ui32 idx1 = (j + 1) * 2;
862 int len1 = cwd_len[idx1] + cwd_len[idx1 + 1];
// Common case: the idx1 pair also fits in the 64-bit word, so all four
// entries are emitted with a single call.
863 if (likely(_cwd_len + len1 <= 64)) {
864 _cwd |= ((
ui64)cwd[idx1]) << _cwd_len;
865 _cwd_len += cwd_len[idx1];
866 _cwd |= ((
ui64)cwd[idx1 + 1]) << _cwd_len;
867 _cwd_len += cwd_len[idx1 + 1];
868 ms_encode_nodefer(msp, _cwd, _cwd_len);
// Otherwise: emit the idx0 pair first, then build and emit the idx1 pair
// separately (the reset of _cwd is on a line not visible here).
870 ms_encode_nodefer(msp, _cwd, _cwd_len);
872 _cwd_len = cwd_len[idx1];
873 _cwd |= ((
ui64)cwd[idx1 + 1]) << _cwd_len;
874 _cwd_len += cwd_len[idx1 + 1];
875 ms_encode_nodefer(msp, _cwd, _cwd_len);
// Builds a 4-bit pattern per lane: bit k is set where eq_vec[k] equals
// e_qmax_vec. Lanes in which u_q_vec is not greater than zero return 0.
// The rest of the parameter list is on a line not visible in this excerpt.
882static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
// All-ones in lanes where u_q_vec > 0.
892 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
// Bit 0: eq_vec[0] == e_qmax (compare mask reduced to 0/1 by >> 31).
894 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
895 auto eps_vec = _mm256_srli_epi32(mask, 31);
// Bit 1: eq_vec[1] == e_qmax.
897 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
898 auto tmp = _mm256_srli_epi32(mask, 31);
899 tmp = _mm256_slli_epi32(tmp, 1);
900 eps_vec = _mm256_or_si256(eps_vec, tmp);
// Bit 2: eq_vec[2] == e_qmax.
902 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
903 tmp = _mm256_srli_epi32(mask, 31);
904 tmp = _mm256_slli_epi32(tmp, 2);
905 eps_vec = _mm256_or_si256(eps_vec, tmp);
// Bit 3: eq_vec[3] == e_qmax.
907 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
908 tmp = _mm256_srli_epi32(mask, 31);
909 tmp = _mm256_slli_epi32(tmp, 3);
910 eps_vec = _mm256_or_si256(eps_vec, tmp);
912 return _mm256_and_si256(u_q_mask, eps_vec);
// Stores into e_val_vec[x] the lane-wise maximum of eq_vec[1] and a permuted
// copy of eq_vec[3], and carries lane 7 of eq_vec[3] over to the next call
// through prev_e_val_vec.
// left_shift is the permutation control supplied by the caller; presumably it
// moves each lane one position up -- confirm against the call site.
915static void update_lep(
ui32 x, __m256i &prev_e_val_vec,
916 __m256i *eq_vec, __m256i *e_val_vec,
917 const __m256i left_shift)
923 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
// Lane 0 comes from lane 0 of the value carried over from the previous call.
924 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
// New carry: lane 7 of eq_vec[3] placed in lane 0, all other lanes zero.
925 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
926 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
// Stores into cx_val_vec[x] a 0/1 flag per lane: bit 3 of the permuted rho
// (with lane 0 taken from the carried-over value) OR bit 1 of this lane's
// rho. Lane 7 of rho_vec is carried to the next call via prev_cx_val_vec.
// left_shift is the permutation control supplied by the caller; presumably it
// moves each lane one position up -- confirm against the call site.
930static void update_lcxp(
ui32 x, __m256i &prev_cx_val_vec,
931 __m256i &rho_vec, __m256i *cx_val_vec,
932 const __m256i left_shift)
938 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
// Lane 0 comes from lane 0 of the value carried over from the previous call.
939 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
// New carry: lane 7 of rho_vec placed in lane 0, all other lanes zero.
940 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
// Bit 3 of the permuted value, moved down to bit 0.
942 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
943 tmp = _mm256_srli_epi32(tmp, 3);
// Bit 1 of the current rho, moved down to bit 0.
945 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
946 tmp1 = _mm256_srli_epi32(tmp1, 1);
947 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
// Looks up eight table entries at once. The per-lane index is
// (cq << 8) + (rho << 4) + eps, and the 32-bit entries are gathered from
// vlc_tbl with a 4-byte scale.
950static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
951 __m256i &eps_vec,
ui32 *vlc_tbl)
954 auto tmp = _mm256_slli_epi32(cq_vec, 8);
955 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
956 tmp = _mm256_add_epi32(tmp, tmp1);
957 tmp = _mm256_add_epi32(tmp, eps_vec);
958 return _mm256_i32gather_epi32((
const int *)vlc_tbl, tmp, 4);
// Returns, per lane, (rho >> 1) | (rho & 1).
// x, cx_val_vec and right_shift are not used by the visible statements; the
// signature matches proc_cq2 (some lines of the body are not visible here).
961static __m256i proc_cq1(
ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
962 const __m256i right_shift)
969 auto tmp = _mm256_srli_epi32(rho_vec, 1);
970 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
971 return _mm256_or_si256(tmp, tmp1);
// Combines values stored in cx_val_vec[x] / cx_val_vec[x + 1] with bits 2 and
// 3 of rho_vec into one value per lane:
//   perm1(cx) + (perm2(cx) << 2), OR (rho & 4) >> 1, OR (rho & 8) >> 2
// where perm1 / perm2 apply right_shift once / twice and the lanes that run
// off the end are refilled from the start of cx_val_vec[x + 1].
// right_shift is the caller's permutation control; presumably it moves each
// lane one position down -- confirm against the call site.
974static __m256i proc_cq2(
ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
975 const __m256i right_shift)
979 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
980 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
// Overwrite the top 64 bits (32-bit lanes 6 and 7) of the twice-permuted
// vector with the low 64 bits of cx_val_vec[x + 1]. On 32-bit x86 this is
// done as two 32-bit inserts.
982#ifdef OJPH_ARCH_X86_64
983 tmp = _mm256_insert_epi64(tmp,
984 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
985#elif (defined OJPH_ARCH_I386)
986 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
987 tmp = _mm256_insert_epi32(tmp, lsb, 6);
988 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
989 tmp = _mm256_insert_epi32(tmp, msb, 7);
991 #error Error unsupport compiler
993 tmp = _mm256_slli_epi32(tmp, 2);
// Once-permuted vector with lane 7 replaced by lane 0 of cx_val_vec[x + 1].
994 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
995 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
996 tmp = _mm256_add_epi32(tmp1, tmp);
// Bit 2 of rho lands on bit 1 of the result.
998 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
999 tmp1 = _mm256_srli_epi32(tmp1, 1);
1000 tmp = _mm256_or_si256(tmp, tmp1);
// Bit 3 of rho also lands on bit 1 of the result.
1002 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
1003 tmp1 = _mm256_srli_epi32(tmp1, 2);
1005 return _mm256_or_si256(tmp, tmp1);
// Prepares per-lane flags for MEL coding and walks the lanes two at a time.
// The statements inside the if-bodies (the actual encode calls) and the
// declaration of mel_bit are on lines not visible in this excerpt.
//   melp        - MEL encoder state
//   cq_vec      - per-lane value; a lane is coded only where it equals 0
//   rho_vec     - per-lane pattern; the bit stored is 1 where it is non-zero
//   u_q_vec     - per-lane value used for the pair-level flag
//   ignore      - reduces the number of lanes processed to 8 - ignore / 2
//   right_shift - permutation control pairing each lane with a neighbour
1008static void proc_mel_encode1(
mel_struct *melp, __m256i &cq_vec,
1009 __m256i &rho_vec, __m256i u_q_vec,
ui32 ignore,
1010 const __m256i right_shift)
1012 int32_t mel_need_encode[8];
1013 int32_t mel_need_encode2[8];
1015 int32_t mel_bit2[8];
// Non-zero (all ones) where cq == 0.
1018 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
// 1 where rho != 0, else 0.
1020 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
// Pair each lane with its permuted neighbour; mel_bit2 is 1 where the smaller
// of the two u_q values is greater than 2.
1024 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
1025 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
1026 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
// Non-zero where both u_q values of the pair are greater than zero.
1029 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
1030 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
1032 ui32 i_max = 8 - (ignore / 2);
1034 for (
ui32 i = 0; i < i_max; i += 2) {
// First lane of the pair.
1035 if (mel_need_encode[i]) {
// Second lane of the pair, only when it lies inside the valid range.
1039 if (i + 1 < i_max) {
1040 if (mel_need_encode[i + 1]) {
// Pair-level flag.
1045 if (mel_need_encode2[i]) {
// Variant of the MEL step that uses a bit mask of the lanes where cq == 0
// instead of per-lane arrays. The loop over the set bits and the encode
// calls are on lines not visible in this excerpt.
// Same parameter list as proc_mel_encode1; u_q_vec and right_shift are not
// used by the visible statements.
1051static void proc_mel_encode2(
mel_struct *melp, __m256i &cq_vec,
1052 __m256i &rho_vec, __m256i u_q_vec,
ui32 ignore,
1053 const __m256i right_shift)
// movemask_epi8 yields one bit per byte, i.e. four identical bits per
// 32-bit lane.
1058 __m256i need = _mm256_cmpeq_epi32(cq_vec, ZERO);
1059 ui32 mask = (
ui32)_mm256_movemask_epi8(need);
1062 ui32 i_max = 8 - (ignore / 2);
// Keep only the bits belonging to the first i_max lanes.
// NOTE(review): when i_max == 8 this shifts a 32-bit value by 32, which is
// undefined; a guard may exist on the line not visible here -- confirm.
1064 mask &= (1u << (i_max * 4)) - 1;
// 1 where rho != 0, else 0.
1070 _mm256_storeu_si256((__m256i*)mel_bit,
1071 _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
// Convert a bit position in the mask back to a lane index (4 bits per lane).
1075 ui32 i = bit_pos / 4;
1081using fn_proc_mel_encode = void (*)(
mel_struct *, __m256i &, __m256i &,
1082 __m256i,
ui32,
const __m256i);
1088 val = tuple[i + 0] >> 4;
1089 size = tuple[i + 0] & 7;
1091 val |= (
ui64)(tuple[i + 1] >> 4) << size;
1092 size += tuple[i + 1] & 7;
1095 val |= (
ui64)(entry >> 5) << size;
1096 size += entry & 0x1F;
1102 ui32 i_max = 8 - (ignore / 2);
1105 for (; i + 2 < i_max; i += 4) {
1106 ui64 val1;
int size1;
1107 build_vlc_uvlc_pair(tuple, u_q, i,
uvlc_tbl, val1, size1);
1108 ui64 val2;
int size2;
1109 build_vlc_uvlc_pair(tuple, u_q, i + 2,
uvlc_tbl, val2, size2);
1110 vlc_encode(vlcp, val1 | (val2 << size1), size1 + size2);
1114 build_vlc_uvlc_pair(tuple, u_q, i,
uvlc_tbl, val, size);
1124 __m256i *e_val_vec, __m256i &prev_e_val_vec,
1125 __m256i *cx_val_vec, __m256i &prev_cx_val_vec,
1127 const __m256i &right_shift,
const __m256i &left_shift)
1137 for (
ui32 x = 0; x < n_loop; ++x) {
1140 if ((x == (n_loop - 1)) && (_width % 16)) {
1141 ui32 tmp_buf[16] = { 0 };
1142 memcpy(tmp_buf, sp, (_width % 16) *
sizeof(
ui32));
1143 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1144 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1145 if (y + 1 < height) {
1146 memcpy(tmp_buf, sp + stride, (_width % 16) *
sizeof(
ui32));
1147 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1148 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1156 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1157 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1159 if (y + 1 < height) {
1160 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1161 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1170 __m256i rho_vec, e_qmax_vec;
1171 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1174 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1175 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1177 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1178 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1181 tmp = _mm256_max_epi32(max_e_vec, ONE);
1182 tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1183 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1185 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1186 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1187 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1188 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1191 tmp = proc_cq1(x, cx_val_vec, rho_vec, right_shift);
1193 tmp = proc_cq2(x, cx_val_vec, rho_vec, right_shift);
1195 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1196 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1197 prev_cq = (
ui32)_mm256_extract_epi32(tmp, 7);
1199 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1200 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1204 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1205 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1207 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1208 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
1209 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1212 proc_mel_encode1(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1215 proc_mel_encode2(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1218 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1222 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1223 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1224 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1226 ui32 i_max = 8 - (_ignore / 2);
1227 if (i_max & 1) { tuple[i_max] = 0; u_q[i_max] = 0; }
1228 tuple[8] = 0; u_q[8] = 0;
1230 proc_vlc_encode(&vlc, tuple, u_q, _ignore,
1231 (PASS == 1) ? uvlc_tbl_pair1 : uvlc_tbl_pair2);
1238 ojph::mem_elastic_allocator *elastic,
1239 ojph::coded_lists *& coded)
1243 ui32 width = (_width + 15) & ~15u;
1244 ui32 ignore = width - _width;
1245 const int ms_size = (16384 * 16 + 14) / 15;
1246 const int mel_vlc_size = 3072;
1247 const int mel_size = 192;
1248 const int vlc_size = mel_vlc_size - mel_size;
1250 ui8 ms_buf[ms_size];
1251 ui8 mel_vlc_buf[mel_vlc_size];
1252 ui8 *mel_buf = mel_vlc_buf;
1253 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1260 ms_init(&ms, ms_size, ms_buf);
1262 const ui32 p = 30 - missing_msbs;
1273 const __m256i right_shift = _mm256_set_epi32(
1274 0, 7, 6, 5, 4, 3, 2, 1
1277 const __m256i left_shift = _mm256_set_epi32(
1278 6, 5, 4, 3, 2, 1, 0, 7
1281 ui32 n_loop = (width + 15) / 16;
1283 __m256i e_val_vec[65];
1285 e_val_vec[i] = ZERO;
1287 __m256i prev_e_val_vec = ZERO;
1289 __m256i cx_val_vec[65];
1290 __m256i prev_cx_val_vec = ZERO;
1297 for (
ui32 y = 0; y < height; y += 2)
1299 e_val_vec[n_loop] = prev_e_val_vec;
1301 tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1302 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1304 prev_e_val_vec = ZERO;
1305 prev_cx_val_vec = ZERO;
1307 ui32 *sp = buf + y * stride;
1310 encode_x_loop<1>(sp, stride, height, y, n_loop, _width,
1311 ignore, p, mel, vlc, ms,
1312 e_val_vec, prev_e_val_vec,
1313 cx_val_vec, prev_cx_val_vec, prev_cq,
1314 right_shift, left_shift);
1316 encode_x_loop<2>(sp, stride, height, y, n_loop, _width,
1317 ignore, p, mel, vlc, ms,
1318 e_val_vec, prev_e_val_vec,
1319 cx_val_vec, prev_cx_val_vec, prev_cq,
1320 right_shift, left_shift);
1322 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1323 tmp = _mm256_slli_epi32(tmp, 2);
1324 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1325 prev_cq = (
ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1333 lengths[0] = mel.pos + vlc.pos + ms.pos;
1334 elastic->
get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1335 memcpy(coded->
buf, ms.buf, ms.pos);
1336 memcpy(coded->
buf + ms.pos, mel.buf, mel.pos);
1337 memcpy(coded->
buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1340 ui32 num_bytes = mel.pos + vlc.pos;
1341 coded->
buf[lengths[0]-1] = (
ui8)(num_bytes >> 4);
1342 coded->
buf[lengths[0]-2] = coded->
buf[lengths[0]-2] & 0xF0;
1343 coded->
buf[lengths[0]-2] =
1344 (
ui8)(coded->
buf[lengths[0]-2] | (num_bytes & 0xF));
void get_buffer(ui32 needed_bytes, coded_lists *&p)
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial rows of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for the initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]
static ui32 population_count(ui32 val)
static ui32 count_trailing_zeros(ui32 val)
#define OJPH_FORCE_INLINE
#define OJPH_ERROR(t,...)
bool last_greater_than_8F