OpenJPH
Open-source implementation of JPEG2000 Part-15
Loading...
Searching...
No Matches
ojph_block_encoder_avx2.cpp
Go to the documentation of this file.
1//***************************************************************************/
2// This software is released under the 2-Clause BSD license, included
3// below.
4//
5// Copyright (c) 2019, Aous Naman
6// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7// Copyright (c) 2019, The University of New South Wales, Australia
8// Copyright (c) 2024, Intel Corporation
9// Copyright (c) 2026, Osamu Watanabe
10//
11// Redistribution and use in source and binary forms, with or without
12// modification, are permitted provided that the following conditions are
13// met:
14//
15// 1. Redistributions of source code must retain the above copyright
16// notice, this list of conditions and the following disclaimer.
17//
18// 2. Redistributions in binary form must reproduce the above copyright
19// notice, this list of conditions and the following disclaimer in the
20// documentation and/or other materials provided with the distribution.
21//
22// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
23// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
25// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
28// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
30// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
31// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33//***************************************************************************/
34// This file is part of the OpenJPH software implementation.
35// File: ojph_block_encoder_avx2.cpp
36//***************************************************************************/
37
38// Apple Clang on Intel produces corrupt bitstreams with several of the
39// scalar optimizations below (branchless VLC drain, 64-bit MagSgn, etc.)
40// for reasons not yet identified. Since this file is x86-only, the guard
41// does not affect Apple Silicon builds (which use NEON, not AVX2).
42#if defined(__apple_build_version__)
44#else
45
46#include "ojph_arch.h"
47#if defined(OJPH_ARCH_I386) || defined(OJPH_ARCH_X86_64)
48
49#include <cassert>
50#include <cstring>
51#include <cstdint>
52#include <climits>
53#include <immintrin.h>
54#include <mutex>
55
56#include "ojph_mem.h"
57#include "ojph_arch.h"
58#include "ojph_block_encoder.h"
59#include "ojph_message.h"
60
// Branch-prediction hints. Compilers other than MSVC get __builtin_expect;
// under MSVC both macros simply evaluate their argument.
#ifndef OJPH_COMPILER_MSVC
  #define likely(x)   __builtin_expect((x), 1)
  #define unlikely(x) __builtin_expect((x), 0)
#else
  #define likely(x)   (x)
  #define unlikely(x) (x)
#endif
68
69namespace ojph {
70 namespace local {
71
73 // tables
75
76 //VLC encoding
77 // index is (c_q << 8) + (rho << 4) + eps
78 // data is (cwd << 8) + (cwd_len << 4) + eps
79 // table 0 is for the initial line of quads
80 static ui32 vlc_tbl0[2048];
81 static ui32 vlc_tbl1[2048];
82
83 //UVLC encoding
84 static ui32 uvlc_tbl_pair1[33 * 33];
85 static ui32 uvlc_tbl_pair2[33 * 33];
86 static ui32 ulvc_cwd_pre[33];
87 static int ulvc_cwd_pre_len[33];
88 static ui32 ulvc_cwd_suf[33];
89 static int ulvc_cwd_suf_len[33];
90
92 static bool vlc_init_tables()
93 {
94 struct vlc_src_table { int c_q, rho, u_off, e_k, e_1, cwd, cwd_len; };
95 vlc_src_table tbl0[] = {
96 #include "table0.h"
97 };
98 size_t tbl0_size = sizeof(tbl0) / sizeof(vlc_src_table);
99
100 si32 pattern_popcnt[16];
101 for (ui32 i = 0; i < 16; ++i)
102 pattern_popcnt[i] = (si32)population_count(i);
103
104 vlc_src_table* src_tbl = tbl0;
105 ui32 *tgt_tbl = vlc_tbl0;
106 size_t tbl_size = tbl0_size;
107 for (int i = 0; i < 2048; ++i)
108 {
109 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
110 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
111 tgt_tbl[i] = 0;
112 else
113 {
114 vlc_src_table *best_entry = NULL;
115 if (emb) // u_off = 1
116 {
117 int best_e_k = -1;
118 for (size_t j = 0; j < tbl_size; ++j)
119 {
120 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
121 if (src_tbl[j].u_off == 1)
122 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
123 {
124 //now we need to find the smallest cwd with the highest
125 // number of bits set in e_k
126 int ones_count = pattern_popcnt[src_tbl[j].e_k];
127 if (ones_count >= best_e_k)
128 {
129 best_entry = src_tbl + j;
130 best_e_k = ones_count;
131 }
132 }
133 }
134 }
135 else // u_off = 0
136 {
137 for (size_t j = 0; j < tbl_size; ++j)
138 {
139 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
140 if (src_tbl[j].u_off == 0)
141 {
142 best_entry = src_tbl + j;
143 break;
144 }
145 }
146 }
147 assert(best_entry);
148 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
149 + best_entry->e_k);
150 }
151 }
152
153 vlc_src_table tbl1[] = {
154 #include "table1.h"
155 };
156 size_t tbl1_size = sizeof(tbl1) / sizeof(vlc_src_table);
157
158 src_tbl = tbl1;
159 tgt_tbl = vlc_tbl1;
160 tbl_size = tbl1_size;
161 for (int i = 0; i < 2048; ++i)
162 {
163 int c_q = i >> 8, rho = (i >> 4) & 0xF, emb = i & 0xF;
164 if (((emb & rho) != emb) || (rho == 0 && c_q == 0))
165 tgt_tbl[i] = 0;
166 else
167 {
168 vlc_src_table *best_entry = NULL;
169 if (emb) // u_off = 1
170 {
171 int best_e_k = -1;
172 for (size_t j = 0; j < tbl_size; ++j)
173 {
174 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
175 if (src_tbl[j].u_off == 1)
176 if ((emb & src_tbl[j].e_k) == src_tbl[j].e_1)
177 {
178 //now we need to find the smallest cwd with the highest
179 // number of bits set in e_k
180 int ones_count = pattern_popcnt[src_tbl[j].e_k];
181 if (ones_count >= best_e_k)
182 {
183 best_entry = src_tbl + j;
184 best_e_k = ones_count;
185 }
186 }
187 }
188 }
189 else // u_off = 0
190 {
191 for (size_t j = 0; j < tbl_size; ++j)
192 {
193 if (src_tbl[j].c_q == c_q && src_tbl[j].rho == rho)
194 if (src_tbl[j].u_off == 0)
195 {
196 best_entry = src_tbl + j;
197 break;
198 }
199 }
200 }
201 assert(best_entry);
202 tgt_tbl[i] = (ui16)((best_entry->cwd<<8) + (best_entry->cwd_len<<4)
203 + best_entry->e_k);
204 }
205 }
206
207
208 return true;
209 }
210
212 static bool uvlc_init_tables()
213 {
214 //code goes from 0 to 31, extension and 32 are not supported here
215 ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2;
216 ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4;
217 ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1;
218 ulvc_cwd_pre_len[2] = 2;
219 ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3;
220 ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0;
221 ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1;
222 ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0;
223 ulvc_cwd_suf_len[2] = 0;
224 ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1;
225 for (int i = 5; i < 33; ++i)
226 {
227 ulvc_cwd_pre[i] = 0;
228 ulvc_cwd_pre_len[i] = 3;
229 ulvc_cwd_suf[i] = (ui32)(i-5);
230 ulvc_cwd_suf_len[i] = 5;
231 }
232 return true;
233 }
234
236 static void uvlc_init_pair_tables()
237 {
238 for (int uq0 = 0; uq0 < 33; ++uq0) {
239 for (int uq1 = 0; uq1 < 33; ++uq1) {
240 ui32 cwd; int len;
241
242 cwd = 0; len = 0;
243 if (uq0 > 2 && uq1 > 2) {
244 cwd |= ulvc_cwd_pre[uq0 - 2];
245 len += ulvc_cwd_pre_len[uq0 - 2];
246 cwd |= ulvc_cwd_pre[uq1 - 2] << len;
247 len += ulvc_cwd_pre_len[uq1 - 2];
248 cwd |= ulvc_cwd_suf[uq0 - 2] << len;
249 len += ulvc_cwd_suf_len[uq0 - 2];
250 cwd |= ulvc_cwd_suf[uq1 - 2] << len;
251 len += ulvc_cwd_suf_len[uq1 - 2];
252 } else if (uq0 > 2 && uq1 > 0) {
253 cwd |= ulvc_cwd_pre[uq0];
254 len += ulvc_cwd_pre_len[uq0];
255 cwd |= (ui32)(uq1 - 1) << len;
256 len += 1;
257 cwd |= ulvc_cwd_suf[uq0] << len;
258 len += ulvc_cwd_suf_len[uq0];
259 } else {
260 cwd |= ulvc_cwd_pre[uq0];
261 len += ulvc_cwd_pre_len[uq0];
262 cwd |= ulvc_cwd_pre[uq1] << len;
263 len += ulvc_cwd_pre_len[uq1];
264 cwd |= ulvc_cwd_suf[uq0] << len;
265 len += ulvc_cwd_suf_len[uq0];
266 cwd |= ulvc_cwd_suf[uq1] << len;
267 len += ulvc_cwd_suf_len[uq1];
268 }
269 uvlc_tbl_pair1[uq0 * 33 + uq1] = (cwd << 5) | (ui32)len;
270
271 cwd = 0; len = 0;
272 cwd |= ulvc_cwd_pre[uq0];
273 len += ulvc_cwd_pre_len[uq0];
274 cwd |= ulvc_cwd_pre[uq1] << len;
275 len += ulvc_cwd_pre_len[uq1];
276 cwd |= ulvc_cwd_suf[uq0] << len;
277 len += ulvc_cwd_suf_len[uq0];
278 cwd |= ulvc_cwd_suf[uq1] << len;
279 len += ulvc_cwd_suf_len[uq1];
280 uvlc_tbl_pair2[uq0 * 33 + uq1] = (cwd << 5) | (ui32)len;
281 }
282 }
283 }
284
287 static bool tables_initialized = false;
288 static std::once_flag tables_initialized_flag;
289 std::call_once(tables_initialized_flag, []() {
290 memset(vlc_tbl0, 0, 2048 * sizeof(ui32));
291 memset(vlc_tbl1, 0, 2048 * sizeof(ui32));
292 tables_initialized = vlc_init_tables();
293 tables_initialized = tables_initialized && uvlc_init_tables();
294 uvlc_init_pair_tables();
295 });
296 return tables_initialized;
297 }
298
300 //
302 struct mel_struct {
303 //storage
304 ui8* buf; //pointer to data buffer
305 ui32 pos; //position of next writing within buf
306 ui32 buf_size; //size of buffer, which we must not exceed
307
308 // all these can be replaced by bytes
309 int remaining_bits; //number of empty bits in tmp
310 int tmp; //temporary storage of coded bits
311 int run; //number of 0 run
312 int k; //state
313 int threshold; //threshold where one bit must be coded
314 };
315
317 static inline void
318 mel_init(mel_struct* melp, ui32 buffer_size, ui8* data)
319 {
320 melp->buf = data;
321 melp->pos = 0;
322 melp->buf_size = buffer_size;
323 melp->remaining_bits = 8;
324 melp->tmp = 0;
325 melp->run = 0;
326 melp->k = 0;
327 melp->threshold = 1; // this is 1 << mel_exp[melp->k];
328 }
329
// Exponent for each MEL state k (0..12); the run threshold of a state is
// 1 << mel_exp[k].
static const int mel_exp[13] = {
  0, 0, 0,
  1, 1, 1,
  2, 2, 2,
  3, 3,
  4,
  5
};
331
333 static inline void
334 mel_emit_bits(mel_struct* melp, ui32 bits, int num_bits)
335 {
336 melp->tmp = (melp->tmp << num_bits) | (int)bits;
337 melp->remaining_bits -= num_bits;
338 if (melp->remaining_bits <= 0) {
339 int excess = -melp->remaining_bits;
340 ui8 byte = (ui8)(melp->tmp >> excess);
341 melp->buf[melp->pos++] = byte;
342 melp->tmp &= (1 << excess) - 1;
343 melp->remaining_bits += 8 - (byte == 0xFF);
344 }
345 }
346
348 static inline void
349 mel_encode(mel_struct* melp, bool bit)
350 {
351 if (bit == false) {
352 ++melp->run;
353 if (melp->run >= melp->threshold) {
354 mel_emit_bits(melp, 1, 1);
355 melp->run = 0;
356 melp->k = ojph_min(12, melp->k + 1);
357 melp->threshold = 1 << mel_exp[melp->k];
358 }
359 } else {
360 int t = mel_exp[melp->k];
361 mel_emit_bits(melp, melp->run & ((1u << t) - 1), t + 1);
362 melp->run = 0;
363 melp->k = ojph_max(0, melp->k - 1);
364 melp->threshold = 1 << mel_exp[melp->k];
365 }
366 }
367
369 static inline void
370 mel_advance_run(mel_struct* melp, ui32 n)
371 {
372 ui32 remaining = n;
373 while (remaining > 0) {
374 ui32 space = (ui32)melp->threshold - (ui32)melp->run;
375 if (remaining >= space) {
376 remaining -= space;
377 mel_emit_bits(melp, 1, 1);
378 melp->run = 0;
379 melp->k = ojph_min(12, melp->k + 1);
380 melp->threshold = 1 << mel_exp[melp->k];
381 } else {
382 melp->run += (int)remaining;
383 remaining = 0;
384 }
385 }
386 }
387
389 static inline void
390 mel_encode_significance(mel_struct* melp)
391 {
392 int t = mel_exp[melp->k];
393 mel_emit_bits(melp, melp->run & ((1u << t) - 1), t + 1);
394 melp->run = 0;
395 melp->k = ojph_max(0, melp->k - 1);
396 melp->threshold = 1 << mel_exp[melp->k];
397 }
398
400 //
402
403 struct vlc_struct {
404 //storage
405 ui8* buf; //pointer to data buffer
406 ui32 pos; //position of next writing within buf
407 ui32 buf_size; //size of buffer, which we must not exceed
408
409 int used_bits; //number of occupied bits in tmp
410 ui64 tmp; //temporary storage of coded bits
411 bool last_greater_than_8F; //true if last byte us greater than 0x8F
412 };
413
415 static inline void
416 vlc_init(vlc_struct* vlcp, ui32 buffer_size, ui8* data)
417 {
418 vlcp->buf = data + buffer_size - 1; //points to last byte
419 vlcp->pos = 1; //locations will be all -pos
420 vlcp->buf_size = buffer_size;
421
422 vlcp->buf[0] = 0xFF;
423 vlcp->used_bits = 4;
424 vlcp->tmp = 0xF;
425 vlcp->last_greater_than_8F = true;
426 }
427
429 static inline void
430 vlc_drain(vlc_struct* vlcp)
431 {
432 while (vlcp->used_bits >= 8) {
433 int escape = (int)vlcp->last_greater_than_8F;
434 int is_7f = (int)((vlcp->tmp & 0x7F) == 0x7F);
435 int need_stuff = escape & is_7f;
436 int bits = 8 - need_stuff;
437
438 ui8 byte = (ui8)(vlcp->tmp & ((1u << bits) - 1));
439 *(vlcp->buf - vlcp->pos) = byte;
440 vlcp->pos++;
441 vlcp->tmp >>= bits;
442 vlcp->used_bits -= bits;
443 vlcp->last_greater_than_8F = byte > 0x8F;
444 }
445 }
446
448 static inline void
449 vlc_encode(vlc_struct* vlcp, ui64 cwd, int cwd_len)
450 {
451 while (true) {
452 int avail = 64 - vlcp->used_bits;
453 if (likely(cwd_len <= avail)) {
454 vlcp->tmp |= cwd << vlcp->used_bits;
455 vlcp->used_bits += cwd_len;
456 return;
457 }
458 vlcp->tmp |= (cwd & ((1ULL << avail) - 1)) << vlcp->used_bits;
459 vlcp->used_bits = 64;
460 vlc_drain(vlcp);
461 cwd >>= avail;
462 cwd_len -= avail;
463 }
464 }
465
467 //
469 static inline void
471 {
472 if (melp->run > 0)
473 mel_emit_bits(melp, 1, 1);
474
475 if (vlcp->last_greater_than_8F && (vlcp->tmp & 0x7f) == 0x7f) {
476 *(vlcp->buf - vlcp->pos) = 0x7f;
477 vlcp->pos++;
478 vlcp->tmp >>= 7;
479 vlcp->used_bits -= 7;
480 }
481
482 melp->tmp = melp->tmp << melp->remaining_bits;
483 int mel_mask = (0xFF << melp->remaining_bits) & 0xFF;
484 int vlc_mask = 0xFF >> (8 - vlcp->used_bits);
485 if ((mel_mask | vlc_mask) == 0)
486 return; //last mel byte cannot be 0xFF, since then
487 //melp->remaining_bits would be < 8
488 if (melp->pos >= melp->buf_size)
489 OJPH_ERROR(0x00020003, "mel encoder's buffer is full");
490 ui8 vlcp_tmp = (ui8)vlcp->tmp;
491 int fuse = melp->tmp | vlcp_tmp;
492 if ( ( ((fuse ^ melp->tmp) & mel_mask)
493 | ((fuse ^ vlcp_tmp) & vlc_mask) ) == 0
494 && (fuse != 0xFF) && vlcp->pos > 1)
495 {
496 melp->buf[melp->pos++] = (ui8)fuse;
497 }
498 else
499 {
500 if (vlcp->pos >= vlcp->buf_size)
501 OJPH_ERROR(0x00020004, "vlc encoder's buffer is full");
502 melp->buf[melp->pos++] = (ui8)melp->tmp; //melp->tmp cannot be 0xFF
503 *(vlcp->buf - vlcp->pos) = (ui8)vlcp_tmp;
504 vlcp->pos++;
505 }
506 }
507
509//
511
512 struct ms_struct {
513 //storage
514 ui8* buf; //pointer to data buffer
515 ui32 pos; //position of next writing within buf
516 ui32 buf_size; //size of buffer, which we must not exceed
517
518 int used_bits; //number of occupied bits in tmp
519 ui64 tmp; //temporary storage of coded bits (64-bit accumulator)
520 bool last_was_ff;//true if the last written byte was 0xFF
521 };
522
524 static inline void
525 ms_init(ms_struct* msp, ui32 buffer_size, ui8* data)
526 {
527 msp->buf = data;
528 msp->pos = 0;
529 msp->buf_size = buffer_size;
530 msp->used_bits = 0;
531 msp->tmp = 0;
532 msp->last_was_ff = false;
533 }
534
536 static inline void
537 ms_drain(ms_struct* msp)
538 {
539 if (msp->last_was_ff) {
540 if (msp->used_bits < 7)
541 return;
542 msp->buf[msp->pos++] = (ui8)(msp->tmp & 0x7F);
543 msp->tmp >>= 7;
544 msp->used_bits -= 7;
545 msp->last_was_ff = false;
546 }
547
548 while (msp->used_bits >= 8) {
549 int n_bytes = msp->used_bits >> 3;
550 if (n_bytes > 8) n_bytes = 8;
551
552 ui64 word = msp->tmp;
553 ui64 valid_mask = (n_bytes < 8)
554 ? (1ULL << (n_bytes * 8)) - 1 : ~(ui64)0;
555
556 ui64 w = ~word;
557 ui64 ff_detect = (w - 0x0101010101010101ULL) & ~w
558 & 0x8080808080808080ULL;
559 ff_detect &= valid_mask;
560
561 if (likely(ff_detect == 0)) {
562 memcpy(msp->buf + msp->pos, &word, (size_t)n_bytes);
563 msp->pos += (ui32)n_bytes;
564 if (n_bytes < 8)
565 msp->tmp >>= (n_bytes * 8);
566 else
567 msp->tmp = 0;
568 msp->used_bits -= n_bytes * 8;
569 } else {
570 int ff_pos = (int)(count_trailing_zeros(ff_detect) >> 3);
571 int safe = ff_pos + 1;
572 memcpy(msp->buf + msp->pos, &word, (size_t)safe);
573 msp->pos += (ui32)safe;
574 int bits = safe * 8;
575 if (bits < 64)
576 msp->tmp >>= bits;
577 else
578 msp->tmp = 0;
579 msp->used_bits -= bits;
580
581 if (msp->used_bits >= 7) {
582 msp->buf[msp->pos++] = (ui8)(msp->tmp & 0x7F);
583 msp->tmp >>= 7;
584 msp->used_bits -= 7;
585 msp->last_was_ff = false;
586 } else {
587 msp->last_was_ff = true;
588 return;
589 }
590 }
591 }
592 }
593
595 static inline void
596 ms_encode_nodefer(ms_struct* msp, ui64 cwd, int cwd_len)
597 {
598 while (true) {
599 int avail = 64 - msp->used_bits;
600 if (likely(cwd_len <= avail)) {
601 msp->tmp |= cwd << msp->used_bits;
602 msp->used_bits += cwd_len;
603 return;
604 }
605 msp->tmp |= (cwd & ((1ULL << avail) - 1)) << msp->used_bits;
606 msp->used_bits = 64;
607 ms_drain(msp);
608 cwd >>= avail;
609 cwd_len -= avail;
610 }
611 }
612
614 static inline void
615 ms_encode(ms_struct* msp, ui64 cwd, int cwd_len)
616 {
617 int avail = 64 - msp->used_bits;
618 if (likely(cwd_len <= avail)) {
619 msp->tmp |= cwd << msp->used_bits;
620 msp->used_bits += cwd_len;
621 } else {
622 msp->tmp |= (cwd & ((1ULL << avail) - 1)) << msp->used_bits;
623 msp->used_bits = 64;
624 ms_drain(msp);
625 cwd >>= avail;
626 cwd_len -= avail;
627 msp->tmp |= cwd << msp->used_bits;
628 msp->used_bits += cwd_len;
629 }
630 ms_drain(msp);
631 }
632
634 static inline void
636 {
637 ms_drain(msp);
638 if (msp->used_bits)
639 {
640 int max_bits = msp->last_was_ff ? 7 : 8;
641 int t = max_bits - msp->used_bits;
642 ui32 byte = (ui32)(msp->tmp & ((1ULL << msp->used_bits) - 1));
643 byte |= (0xFFu & ((1u << t) - 1)) << msp->used_bits;
644 if (byte != 0xFF)
645 {
646 if (msp->pos >= msp->buf_size)
647 OJPH_ERROR(0x00020006, "magnitude sign encoder's buffer is full");
648 msp->buf[msp->pos++] = (ui8)byte;
649 }
650 }
651 else if (msp->last_was_ff)
652 msp->pos--;
653 }
654
// Shorthand for two vector constants used throughout the AVX2 code:
// all lanes 0, and every 32-bit lane equal to 1.
#define ZERO (_mm256_setzero_si256())
#define ONE  (_mm256_set1_epi32(1))
657
// Per-lane count of leading zeros for eight 32-bit integers; AVX2 has no
// such instruction. Method from https://stackoverflow.com/a/58827596:
// convert each lane to float and read the exponent. A zero lane gives 32.
inline __m256i avx2_lzcnt_epi32(__m256i v) {
  // keep only the 8 most significant set bits, so that the conversion
  // cannot round the value up to the next power of two
  const __m256i top_bits = _mm256_andnot_si256(_mm256_srli_epi32(v, 8), v);

  // convert an integer to float and shift down the exponent
  const __m256i as_float = _mm256_castps_si256(_mm256_cvtepi32_ps(top_bits));
  const __m256i exponent = _mm256_srli_epi32(as_float, 23);

  // undo the bias, then clamp at 32
  const __m256i count = _mm256_subs_epu16(_mm256_set1_epi32(158), exponent);
  return _mm256_min_epi16(count, _mm256_set1_epi32(32));
}
670
// Per-lane "not equal" for 32-bit integers: a lane is all ones where v and
// v2 differ and 0 where they are equal. Built by inverting the result of
// the equality compare.
inline __m256i avx2_cmpneq_epi32(__m256i v, __m256i v2) {
  const __m256i equal = _mm256_cmpeq_epi32(v, v2);
  const __m256i all_ones = _mm256_set1_epi32((int32_t)0xffffffff);
  return _mm256_xor_si256(equal, all_ones);
}
674
675static void proc_pixel(__m256i *src_vec, ui32 p,
676 __m256i *eq_vec, __m256i *s_vec,
677 __m256i &rho_vec, __m256i &e_qmax_vec)
678{
679 __m256i val_vec[4];
680 __m256i _eq_vec[4];
681 __m256i _s_vec[4];
682 __m256i _rho_vec[4];
683
684 for (ui32 i = 0; i < 4; ++i) {
685 /* val = t + t; //multiply by 2 and get rid of sign */
686 val_vec[i] = _mm256_add_epi32(src_vec[i], src_vec[i]);
687
688 /* val >>= p; // 2 \mu_p + x */
689 val_vec[i] = _mm256_srli_epi32(val_vec[i], (int)p);
690
691 /* val &= ~1u; // 2 \mu_p */
692 val_vec[i] = _mm256_and_si256(val_vec[i], _mm256_set1_epi32((int)~1u));
693
694 /* if (val) { */
695 const __m256i val_notmask = avx2_cmpneq_epi32(val_vec[i], ZERO);
696
697 /* rho[i] = 1 << i;
698 * rho is processed below.
699 */
700
701 /* e_q[i] = 32 - (int)count_leading_ZEROs(--val); //2\mu_p - 1 */
702 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
703 _eq_vec[i] = avx2_lzcnt_epi32(val_vec[i]);
704 _eq_vec[i] = _mm256_sub_epi32(_mm256_set1_epi32(32), _eq_vec[i]);
705
706 /* e_qmax[i] = ojph_max(e_qmax[i], e_q[j]);
707 * e_qmax is processed below
708 */
709
710 /* s[0] = --val + (t >> 31); //v_n = 2(\mu_p-1) + s_n */
711 val_vec[i] = _mm256_sub_epi32(val_vec[i], ONE);
712 _s_vec[i] = _mm256_srli_epi32(src_vec[i], 31);
713 _s_vec[i] = _mm256_add_epi32(_s_vec[i], val_vec[i]);
714
715 _eq_vec[i] = _mm256_and_si256(_eq_vec[i], val_notmask);
716 _s_vec[i] = _mm256_and_si256(_s_vec[i], val_notmask);
717 val_vec[i] = _mm256_srli_epi32(val_notmask, 31);
718 /* } */
719 }
720
721 const __m256i idx = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
722
723 /* Reorder from
724 * *_vec[0]:[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7]
725 * *_vec[1]:[1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5],.[1, 6], [1, 7]
726 * *_vec[2]:[0, 8], [0, 9], [0,10], [0,11], [0,12], [0,13], [0,14], [0,15]
727 * *_vec[3]:[1, 8], [1, 9], [1,10], [1,11], [1,12], [1,13], [1,14], [1,15]
728 * to
729 * *_vec[0]:[0, 0], [0, 2], [0, 4], [0, 6], [0, 8], [0,10], [0,12], [0,14]
730 * *_vec[1]:[1, 0], [1, 2], [1, 4], [1, 6], [1, 8], [1,10], [1,12], [1,14]
731 * *_vec[2]:[0, 1], [0, 3], [0, 5], [0, 7], [0, 9], [0,11], [0,13], [0,15]
732 * *_vec[3]:[1, 1], [1, 3], [1, 5], [1, 7], [1, 9], [1,11], [1,13], [1,15]
733 */
734 __m256i tmp1, tmp2;
735 for (ui32 i = 0; i < 2; ++i) {
736 tmp1 = _mm256_permutevar8x32_epi32(_eq_vec[0 + i], idx);
737 tmp2 = _mm256_permutevar8x32_epi32(_eq_vec[2 + i], idx);
738 eq_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
739 eq_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
740
741 tmp1 = _mm256_permutevar8x32_epi32(_s_vec[0 + i], idx);
742 tmp2 = _mm256_permutevar8x32_epi32(_s_vec[2 + i], idx);
743 s_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
744 s_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
745
746 tmp1 = _mm256_permutevar8x32_epi32(val_vec[0 + i], idx);
747 tmp2 = _mm256_permutevar8x32_epi32(val_vec[2 + i], idx);
748 _rho_vec[0 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (0 << 0) + (2 << 4));
749 _rho_vec[2 + i] = _mm256_permute2x128_si256(tmp1, tmp2, (1 << 0) + (3 << 4));
750 }
751
752 e_qmax_vec = _mm256_max_epi32(eq_vec[0], eq_vec[1]);
753 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[2]);
754 e_qmax_vec = _mm256_max_epi32(e_qmax_vec, eq_vec[3]);
755 _rho_vec[1] = _mm256_slli_epi32(_rho_vec[1], 1);
756 _rho_vec[2] = _mm256_slli_epi32(_rho_vec[2], 2);
757 _rho_vec[3] = _mm256_slli_epi32(_rho_vec[3], 3);
758 rho_vec = _mm256_or_si256(_rho_vec[0], _rho_vec[1]);
759 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[2]);
760 rho_vec = _mm256_or_si256(rho_vec, _rho_vec[3]);
761}
762
/* Transposes a 4 x 8 matrix of 32-bit values held in matrix[0..3], in
 * place, so that the 4 values of one column become adjacent.
 *
 * from [0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07]
 *      [0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17]
 *      [0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27]
 *      [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37]
 *
 * to   [0x00, 0x10, 0x20, 0x30, 0x01, 0x11, 0x21, 0x31]
 *      [0x02, 0x12, 0x22, 0x32, 0x03, 0x13, 0x23, 0x33]
 *      [0x04, 0x14, 0x24, 0x34, 0x05, 0x15, 0x25, 0x35]
 *      [0x06, 0x16, 0x26, 0x36, 0x07, 0x17, 0x27, 0x37]
 */
static void rotate_matrix(__m256i *matrix)
{
  // interleave 32-bit values of rows 0/1 and rows 2/3
  const __m256i lo01 = _mm256_unpacklo_epi32(matrix[0], matrix[1]);
  const __m256i lo23 = _mm256_unpacklo_epi32(matrix[2], matrix[3]);
  const __m256i hi01 = _mm256_unpackhi_epi32(matrix[0], matrix[1]);
  const __m256i hi23 = _mm256_unpackhi_epi32(matrix[2], matrix[3]);

  // interleave 64-bit pairs: each 128-bit half now holds one full column
  const __m256i col_a = _mm256_unpacklo_epi64(lo01, lo23);
  const __m256i col_b = _mm256_unpacklo_epi64(hi01, hi23);
  const __m256i col_c = _mm256_unpackhi_epi64(lo01, lo23);
  const __m256i col_d = _mm256_unpackhi_epi64(hi01, hi23);

  // bring the low halves together, then the high halves
  matrix[0] = _mm256_permute2x128_si256(col_a, col_c, 0x20);
  matrix[2] = _mm256_permute2x128_si256(col_a, col_c, 0x31);
  matrix[1] = _mm256_permute2x128_si256(col_b, col_d, 0x20);
  matrix[3] = _mm256_permute2x128_si256(col_b, col_d, 0x31);
}
796
797static void proc_ms_encode(ms_struct *msp,
798 __m256i &tuple_vec,
799 __m256i &uq_vec,
800 __m256i &rho_vec,
801 __m256i *s_vec)
802{
803 __m256i m_vec[4];
804
805 /* Prepare parameters for ms_encode */
806 /* m = (rho[i] & 1) ? Uq[i] - ((tuple[i] & 1) >> 0) : 0; */
807 auto tmp = _mm256_and_si256(tuple_vec, ONE);
808 tmp = _mm256_sub_epi32(uq_vec, tmp);
809 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
810 auto mask = avx2_cmpneq_epi32(tmp1, ZERO);
811 m_vec[0] = _mm256_and_si256(mask, tmp);
812
813 /* m = (rho[i] & 2) ? Uq[i] - ((tuple[i] & 2) >> 1) : 0; */
814 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(2));
815 tmp = _mm256_srli_epi32(tmp, 1);
816 tmp = _mm256_sub_epi32(uq_vec, tmp);
817 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
818 mask = avx2_cmpneq_epi32(tmp1, ZERO);
819 m_vec[1] = _mm256_and_si256(mask, tmp);
820
821 /* m = (rho[i] & 4) ? Uq[i] - ((tuple[i] & 4) >> 2) : 0; */
822 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(4));
823 tmp = _mm256_srli_epi32(tmp, 2);
824 tmp = _mm256_sub_epi32(uq_vec, tmp);
825 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
826 mask = avx2_cmpneq_epi32(tmp1, ZERO);
827 m_vec[2] = _mm256_and_si256(mask, tmp);
828
829 /* m = (rho[i] & 8) ? Uq[i] - ((tuple[i] & 8) >> 3) : 0; */
830 tmp = _mm256_and_si256(tuple_vec, _mm256_set1_epi32(8));
831 tmp = _mm256_srli_epi32(tmp, 3);
832 tmp = _mm256_sub_epi32(uq_vec, tmp);
833 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
834 mask = avx2_cmpneq_epi32(tmp1, ZERO);
835 m_vec[3] = _mm256_and_si256(mask, tmp);
836
837 rotate_matrix(m_vec);
838 rotate_matrix(s_vec);
839
840 ui32 cwd[8];
841 int cwd_len[8];
842
843 /* Each iteration process 8 bytes * 2 lines */
844 for (ui32 i = 0; i < 4; ++i) {
845 /* cwd = s[i * 4 + 0] & ((1U << m) - 1)
846 * cwd_len = m
847 */
848 _mm256_storeu_si256((__m256i *)cwd_len, m_vec[i]);
849 tmp = _mm256_sllv_epi32(ONE, m_vec[i]);
850 tmp = _mm256_sub_epi32(tmp, ONE);
851 tmp = _mm256_and_si256(tmp, s_vec[i]);
852 _mm256_storeu_si256((__m256i*)cwd, tmp);
853
854 for (ui32 j = 0; j < 4; j += 2) {
855 ui32 idx0 = j * 2;
856 ui64 _cwd = cwd[idx0];
857 int _cwd_len = cwd_len[idx0];
858 _cwd |= ((ui64)cwd[idx0 + 1]) << _cwd_len;
859 _cwd_len += cwd_len[idx0 + 1];
860
861 ui32 idx1 = (j + 1) * 2;
862 int len1 = cwd_len[idx1] + cwd_len[idx1 + 1];
863 if (likely(_cwd_len + len1 <= 64)) {
864 _cwd |= ((ui64)cwd[idx1]) << _cwd_len;
865 _cwd_len += cwd_len[idx1];
866 _cwd |= ((ui64)cwd[idx1 + 1]) << _cwd_len;
867 _cwd_len += cwd_len[idx1 + 1];
868 ms_encode_nodefer(msp, _cwd, _cwd_len);
869 } else {
870 ms_encode_nodefer(msp, _cwd, _cwd_len);
871 _cwd = cwd[idx1];
872 _cwd_len = cwd_len[idx1];
873 _cwd |= ((ui64)cwd[idx1 + 1]) << _cwd_len;
874 _cwd_len += cwd_len[idx1 + 1];
875 ms_encode_nodefer(msp, _cwd, _cwd_len);
876 }
877 }
878 }
879 ms_drain(msp);
880}
881
882static __m256i cal_eps_vec(__m256i *eq_vec, __m256i &u_q_vec,
883 __m256i &e_qmax_vec)
884{
885 /* if (u_q[i] > 0) {
886 * eps[i] |= (e_q[i * 4 + 0] == e_qmax[i]);
887 * eps[i] |= (e_q[i * 4 + 1] == e_qmax[i]) << 1;
888 * eps[i] |= (e_q[i * 4 + 2] == e_qmax[i]) << 2;
889 * eps[i] |= (e_q[i * 4 + 3] == e_qmax[i]) << 3;
890 * }
891 */
892 auto u_q_mask = _mm256_cmpgt_epi32(u_q_vec, ZERO);
893
894 auto mask = _mm256_cmpeq_epi32(eq_vec[0], e_qmax_vec);
895 auto eps_vec = _mm256_srli_epi32(mask, 31);
896
897 mask = _mm256_cmpeq_epi32(eq_vec[1], e_qmax_vec);
898 auto tmp = _mm256_srli_epi32(mask, 31);
899 tmp = _mm256_slli_epi32(tmp, 1);
900 eps_vec = _mm256_or_si256(eps_vec, tmp);
901
902 mask = _mm256_cmpeq_epi32(eq_vec[2], e_qmax_vec);
903 tmp = _mm256_srli_epi32(mask, 31);
904 tmp = _mm256_slli_epi32(tmp, 2);
905 eps_vec = _mm256_or_si256(eps_vec, tmp);
906
907 mask = _mm256_cmpeq_epi32(eq_vec[3], e_qmax_vec);
908 tmp = _mm256_srli_epi32(mask, 31);
909 tmp = _mm256_slli_epi32(tmp, 3);
910 eps_vec = _mm256_or_si256(eps_vec, tmp);
911
912 return _mm256_and_si256(u_q_mask, eps_vec);
913}
914
915static void update_lep(ui32 x, __m256i &prev_e_val_vec,
916 __m256i *eq_vec, __m256i *e_val_vec,
917 const __m256i left_shift)
918{
919 /* lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++;
920 * lep[0] = (ui8)e_q[3];
921 * Compare e_q[1] with e_q[3] of the prevous round.
922 */
923 auto tmp = _mm256_permutevar8x32_epi32(eq_vec[3], left_shift);
924 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_e_val_vec)), 0);
925 prev_e_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(eq_vec[3], 7), 0);
926 e_val_vec[x] = _mm256_max_epi32(eq_vec[1], tmp);
927}
928
929
930static void update_lcxp(ui32 x, __m256i &prev_cx_val_vec,
931 __m256i &rho_vec, __m256i *cx_val_vec,
932 const __m256i left_shift)
933{
934 /* lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++;
935 * lcxp[0] = (ui8)((rho[0] & 8) >> 3);
936 * Or (rho[0] & 2) and (rho[0] of the previous round & 8).
937 */
938 auto tmp = _mm256_permutevar8x32_epi32(rho_vec, left_shift);
939 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(prev_cx_val_vec)), 0);
940 prev_cx_val_vec = _mm256_insert_epi32(ZERO, _mm256_extract_epi32(rho_vec, 7), 0);
941
942 tmp = _mm256_and_si256(tmp, _mm256_set1_epi32(8));
943 tmp = _mm256_srli_epi32(tmp, 3);
944
945 auto tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(2));
946 tmp1 = _mm256_srli_epi32(tmp1, 1);
947 cx_val_vec[x] = _mm256_or_si256(tmp, tmp1);
948}
949
950static __m256i cal_tuple(__m256i &cq_vec, __m256i &rho_vec,
951 __m256i &eps_vec, ui32 *vlc_tbl)
952{
953 /* tuple[i] = vlc_tbl1[(c_q[i] << 8) + (rho[i] << 4) + eps[i]]; */
954 auto tmp = _mm256_slli_epi32(cq_vec, 8);
955 auto tmp1 = _mm256_slli_epi32(rho_vec, 4);
956 tmp = _mm256_add_epi32(tmp, tmp1);
957 tmp = _mm256_add_epi32(tmp, eps_vec);
958 return _mm256_i32gather_epi32((const int *)vlc_tbl, tmp, 4);
959}
960
961static __m256i proc_cq1(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
962 const __m256i right_shift)
963{
964 ojph_unused(x);
965 ojph_unused(cx_val_vec);
966 ojph_unused(right_shift);
967
968 /* c_q[i + 1] = (rho[i] >> 1) | (rho[i] & 1); */
969 auto tmp = _mm256_srli_epi32(rho_vec, 1);
970 auto tmp1 = _mm256_and_si256(rho_vec, ONE);
971 return _mm256_or_si256(tmp, tmp1);
972}
973
974static __m256i proc_cq2(ui32 x, __m256i *cx_val_vec, __m256i &rho_vec,
975 const __m256i right_shift)
976{
977 // c_q[i + 1] = (lcxp[i + 1] + (lcxp[i + 2] << 2))
978 // | (((rho[i] & 4) >> 1) | ((rho[i] & 8) >> 2));
979 auto lcxp1_vec = _mm256_permutevar8x32_epi32(cx_val_vec[x], right_shift);
980 auto tmp = _mm256_permutevar8x32_epi32(lcxp1_vec, right_shift);
981
982#ifdef OJPH_ARCH_X86_64
983 tmp = _mm256_insert_epi64(tmp,
984 _mm_cvtsi128_si64(_mm256_castsi256_si128(cx_val_vec[x + 1])), 3);
985#elif (defined OJPH_ARCH_I386)
986 int lsb = _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1]));
987 tmp = _mm256_insert_epi32(tmp, lsb, 6);
988 int msb = _mm_extract_epi32(_mm256_castsi256_si128(cx_val_vec[x + 1]), 1);
989 tmp = _mm256_insert_epi32(tmp, msb, 7);
990#else
991 #error Error unsupport compiler
992#endif
993 tmp = _mm256_slli_epi32(tmp, 2);
994 auto tmp1 = _mm256_insert_epi32(lcxp1_vec,
995 _mm_cvtsi128_si32(_mm256_castsi256_si128(cx_val_vec[x + 1])), 7);
996 tmp = _mm256_add_epi32(tmp1, tmp);
997
998 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(4));
999 tmp1 = _mm256_srli_epi32(tmp1, 1);
1000 tmp = _mm256_or_si256(tmp, tmp1);
1001
1002 tmp1 = _mm256_and_si256(rho_vec, _mm256_set1_epi32(8));
1003 tmp1 = _mm256_srli_epi32(tmp1, 2);
1004
1005 return _mm256_or_si256(tmp, tmp1);
1006}
1007
1008static void proc_mel_encode1(mel_struct *melp, __m256i &cq_vec,
1009 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
1010 const __m256i right_shift)
1011{
1012 int32_t mel_need_encode[8];
1013 int32_t mel_need_encode2[8];
1014 int32_t mel_bit[8];
1015 int32_t mel_bit2[8];
1016 /* Prepare mel_encode params */
1017 /* if (c_q[i] == 0) { */
1018 _mm256_storeu_si256((__m256i *)mel_need_encode, _mm256_cmpeq_epi32(cq_vec, ZERO));
1019 /* mel_encode(&mel, rho[i] != 0); */
1020 _mm256_storeu_si256((__m256i*)mel_bit, _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
1021 /* } */
1022
1023 /* mel_encode(&mel, ojph_min(u_q[i], u_q[i + 1]) > 2); */
1024 auto tmp = _mm256_permutevar8x32_epi32(u_q_vec, right_shift);
1025 auto tmp1 = _mm256_min_epi32(u_q_vec, tmp);
1026 _mm256_storeu_si256((__m256i*)mel_bit2, _mm256_srli_epi32(_mm256_cmpgt_epi32(tmp1, _mm256_set1_epi32(2)), 31));
1027
1028 /* if (u_q[i] > 0 && u_q[i + 1] > 0) { } */
1029 auto need_encode2 = _mm256_cmpgt_epi32(u_q_vec, ZERO);
1030 _mm256_storeu_si256((__m256i*)mel_need_encode2, _mm256_and_si256(need_encode2, _mm256_cmpgt_epi32(tmp, ZERO)));
1031
1032 ui32 i_max = 8 - (ignore / 2);
1033
1034 for (ui32 i = 0; i < i_max; i += 2) {
1035 if (mel_need_encode[i]) {
1036 mel_encode(melp, mel_bit[i]);
1037 }
1038
1039 if (i + 1 < i_max) {
1040 if (mel_need_encode[i + 1]) {
1041 mel_encode(melp, mel_bit[i + 1]);
1042 }
1043 }
1044
1045 if (mel_need_encode2[i]) {
1046 mel_encode(melp, mel_bit2[i]);
1047 }
1048 }
1049}
1050
1051static void proc_mel_encode2(mel_struct *melp, __m256i &cq_vec,
1052 __m256i &rho_vec, __m256i u_q_vec, ui32 ignore,
1053 const __m256i right_shift)
1054{
1055 ojph_unused(u_q_vec);
1056 ojph_unused(right_shift);
1057
1058 __m256i need = _mm256_cmpeq_epi32(cq_vec, ZERO);
1059 ui32 mask = (ui32)_mm256_movemask_epi8(need);
1060 mask &= 0x88888888;
1061
1062 ui32 i_max = 8 - (ignore / 2);
1063 if (i_max < 8)
1064 mask &= (1u << (i_max * 4)) - 1;
1065
1066 if (mask == 0)
1067 return;
1068
1069 int32_t mel_bit[8];
1070 _mm256_storeu_si256((__m256i*)mel_bit,
1071 _mm256_srli_epi32(avx2_cmpneq_epi32(rho_vec, ZERO), 31));
1072
1073 while (mask) {
1074 ui32 bit_pos = (ui32)count_trailing_zeros(mask);
1075 ui32 i = bit_pos / 4;
1076 mel_encode(melp, mel_bit[i]);
1077 mask &= mask - 1;
1078 }
1079}
1080
1081using fn_proc_mel_encode = void (*)(mel_struct *, __m256i &, __m256i &,
1082 __m256i, ui32, const __m256i);
1083
1084static inline void
1085build_vlc_uvlc_pair(ui32 *tuple, ui32 *u_q, ui32 i,
1086 const ui32 *uvlc_tbl, ui64 &val, int &size)
1087{
1088 val = tuple[i + 0] >> 4;
1089 size = tuple[i + 0] & 7;
1090
1091 val |= (ui64)(tuple[i + 1] >> 4) << size;
1092 size += tuple[i + 1] & 7;
1093
1094 ui32 entry = uvlc_tbl[u_q[i] * 33 + u_q[i + 1]];
1095 val |= (ui64)(entry >> 5) << size;
1096 size += entry & 0x1F;
1097}
1098
1099static void proc_vlc_encode(vlc_struct *vlcp, ui32 *tuple,
1100 ui32 *u_q, ui32 ignore, const ui32 *uvlc_tbl)
1101{
1102 ui32 i_max = 8 - (ignore / 2);
1103
1104 ui32 i = 0;
1105 for (; i + 2 < i_max; i += 4) {
1106 ui64 val1; int size1;
1107 build_vlc_uvlc_pair(tuple, u_q, i, uvlc_tbl, val1, size1);
1108 ui64 val2; int size2;
1109 build_vlc_uvlc_pair(tuple, u_q, i + 2, uvlc_tbl, val2, size2);
1110 vlc_encode(vlcp, val1 | (val2 << size1), size1 + size2);
1111 }
1112 if (i < i_max) {
1113 ui64 val; int size;
1114 build_vlc_uvlc_pair(tuple, u_q, i, uvlc_tbl, val, size);
1115 vlc_encode(vlcp, val, size);
1116 }
1117}
1118
1119template<int PASS>
1120OJPH_FORCE_INLINE void encode_x_loop(
1121 ui32 *sp, ui32 stride, ui32 height, ui32 y,
1122 ui32 n_loop, ui32 _width, ui32 ignore, ui32 p,
1123 mel_struct &mel, vlc_struct &vlc, ms_struct &ms,
1124 __m256i *e_val_vec, __m256i &prev_e_val_vec,
1125 __m256i *cx_val_vec, __m256i &prev_cx_val_vec,
1126 ui32 &prev_cq,
1127 const __m256i &right_shift, const __m256i &left_shift)
1128{
1129 ui32 *vlc_tbl = (PASS == 1) ? vlc_tbl0 : vlc_tbl1;
1130
1131 __m256i tmp, tmp1;
1132 __m256i eq_vec[4];
1133 __m256i s_vec[4];
1134 __m256i src_vec[4];
1135
1136 /* 16 bytes per iteration */
1137 for (ui32 x = 0; x < n_loop; ++x) {
1138
1139 /* t = sp[i]; */
1140 if ((x == (n_loop - 1)) && (_width % 16)) {
1141 ui32 tmp_buf[16] = { 0 };
1142 memcpy(tmp_buf, sp, (_width % 16) * sizeof(ui32));
1143 src_vec[0] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1144 src_vec[2] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1145 if (y + 1 < height) {
1146 memcpy(tmp_buf, sp + stride, (_width % 16) * sizeof(ui32));
1147 src_vec[1] = _mm256_loadu_si256((__m256i*)(tmp_buf));
1148 src_vec[3] = _mm256_loadu_si256((__m256i*)(tmp_buf + 8));
1149 }
1150 else {
1151 src_vec[1] = ZERO;
1152 src_vec[3] = ZERO;
1153 }
1154 }
1155 else {
1156 src_vec[0] = _mm256_loadu_si256((__m256i*)(sp));
1157 src_vec[2] = _mm256_loadu_si256((__m256i*)(sp + 8));
1158
1159 if (y + 1 < height) {
1160 src_vec[1] = _mm256_loadu_si256((__m256i*)(sp + stride));
1161 src_vec[3] = _mm256_loadu_si256((__m256i*)(sp + 8 + stride));
1162 }
1163 else {
1164 src_vec[1] = ZERO;
1165 src_vec[3] = ZERO;
1166 }
1167 sp += 16;
1168 }
1169
1170 __m256i rho_vec, e_qmax_vec;
1171 proc_pixel(src_vec, p, eq_vec, s_vec, rho_vec, e_qmax_vec);
1172
1173 // max_e[(i + 1) % num] = ojph_max(lep[i + 1], lep[i + 2]) - 1;
1174 tmp = _mm256_permutevar8x32_epi32(e_val_vec[x], right_shift);
1175 tmp = _mm256_insert_epi32(tmp, _mm_cvtsi128_si32(_mm256_castsi256_si128(e_val_vec[x + 1])), 7);
1176
1177 auto max_e_vec = _mm256_max_epi32(tmp, e_val_vec[x]);
1178 max_e_vec = _mm256_sub_epi32(max_e_vec, ONE);
1179
1180 // kappa[i] = (rho[i] & (rho[i] - 1)) ? ojph_max(1, max_e[i]) : 1;
1181 tmp = _mm256_max_epi32(max_e_vec, ONE);
1182 tmp1 = _mm256_sub_epi32(rho_vec, ONE);
1183 tmp1 = _mm256_and_si256(rho_vec, tmp1);
1184
1185 auto cmp = _mm256_cmpeq_epi32(tmp1, ZERO);
1186 auto kappa_vec1_ = _mm256_and_si256(cmp, ONE);
1187 auto kappa_vec2_ = _mm256_and_si256(_mm256_xor_si256(cmp, _mm256_set1_epi32((int32_t)0xffffffff)), tmp);
1188 const __m256i kappa_vec = _mm256_max_epi32(kappa_vec1_, kappa_vec2_);
1189
1190 if (PASS == 1)
1191 tmp = proc_cq1(x, cx_val_vec, rho_vec, right_shift);
1192 else
1193 tmp = proc_cq2(x, cx_val_vec, rho_vec, right_shift);
1194
1195 auto cq_vec = _mm256_permutevar8x32_epi32(tmp, left_shift);
1196 cq_vec = _mm256_insert_epi32(cq_vec, prev_cq, 0);
1197 prev_cq = (ui32)_mm256_extract_epi32(tmp, 7);
1198
1199 update_lep(x, prev_e_val_vec, eq_vec, e_val_vec, left_shift);
1200 update_lcxp(x, prev_cx_val_vec, rho_vec, cx_val_vec, left_shift);
1201
1202 /* Uq[i] = ojph_max(e_qmax[i], kappa[i]); */
1203 /* u_q[i] = Uq[i] - kappa[i]; */
1204 auto uq_vec = _mm256_max_epi32(kappa_vec, e_qmax_vec);
1205 auto u_q_vec = _mm256_sub_epi32(uq_vec, kappa_vec);
1206
1207 auto eps_vec = cal_eps_vec(eq_vec, u_q_vec, e_qmax_vec);
1208 __m256i tuple_vec = cal_tuple(cq_vec, rho_vec, eps_vec, vlc_tbl);
1209 ui32 _ignore = ((n_loop - 1) == x) ? ignore : 0;
1210
1211 if (PASS == 1)
1212 proc_mel_encode1(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1213 right_shift);
1214 else
1215 proc_mel_encode2(&mel, cq_vec, rho_vec, u_q_vec, _ignore,
1216 right_shift);
1217
1218 proc_ms_encode(&ms, tuple_vec, uq_vec, rho_vec, s_vec);
1219
1220 ui32 u_q[10];
1221 ui32 tuple[10];
1222 tuple_vec = _mm256_srli_epi32(tuple_vec, 4);
1223 _mm256_storeu_si256((__m256i*)tuple, tuple_vec);
1224 _mm256_storeu_si256((__m256i*)u_q, u_q_vec);
1225 {
1226 ui32 i_max = 8 - (_ignore / 2);
1227 if (i_max & 1) { tuple[i_max] = 0; u_q[i_max] = 0; }
1228 tuple[8] = 0; u_q[8] = 0;
1229 }
1230 proc_vlc_encode(&vlc, tuple, u_q, _ignore,
1231 (PASS == 1) ? uvlc_tbl_pair1 : uvlc_tbl_pair2);
1232 }
1233}
1234
1235void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs,
1236 ui32 num_passes, ui32 _width, ui32 height,
1237 ui32 stride, ui32* lengths,
1238 ojph::mem_elastic_allocator *elastic,
1239 ojph::coded_lists *& coded)
1240{
1241 ojph_unused(num_passes); //currently not used
1242
1243 ui32 width = (_width + 15) & ~15u;
1244 ui32 ignore = width - _width;
1245 const int ms_size = (16384 * 16 + 14) / 15; //more than enough
1246 const int mel_vlc_size = 3072; //more than enough
1247 const int mel_size = 192;
1248 const int vlc_size = mel_vlc_size - mel_size;
1249
1250 ui8 ms_buf[ms_size];
1251 ui8 mel_vlc_buf[mel_vlc_size];
1252 ui8 *mel_buf = mel_vlc_buf;
1253 ui8 *vlc_buf = mel_vlc_buf + mel_size;
1254
1255 mel_struct mel;
1256 mel_init(&mel, mel_size, mel_buf);
1257 vlc_struct vlc;
1258 vlc_init(&vlc, vlc_size, vlc_buf);
1259 ms_struct ms;
1260 ms_init(&ms, ms_size, ms_buf);
1261
1262 const ui32 p = 30 - missing_msbs;
1263
1264 //e_val: E values for a line (these are the highest set bit)
1265 //cx_val: is the context values
1266 //Each byte stores the info for the 2 sample. For E, it is maximum
1267 // of the two samples, while for cx, it is the OR of these two samples.
1268 //The maximum is between the pixel at the bottom left of one quad
1269 // and the bottom right of the earlier quad. The same is true for cx.
1270 //For a 1024 pixels, we need 512 bytes, the 2 extra,
1271 // one for the non-existing earlier quad, and one for beyond the
1272 // the end
1273 const __m256i right_shift = _mm256_set_epi32(
1274 0, 7, 6, 5, 4, 3, 2, 1
1275 );
1276
1277 const __m256i left_shift = _mm256_set_epi32(
1278 6, 5, 4, 3, 2, 1, 0, 7
1279 );
1280
1281 ui32 n_loop = (width + 15) / 16;
1282
1283 __m256i e_val_vec[65];
1284 for (ui32 i = 0; i <ojph_min(64, n_loop); ++i) {
1285 e_val_vec[i] = ZERO;
1286 }
1287 __m256i prev_e_val_vec = ZERO;
1288
1289 __m256i cx_val_vec[65];
1290 __m256i prev_cx_val_vec = ZERO;
1291
1292 ui32 prev_cq = 0;
1293
1294 __m256i tmp;
1295
1296 /* 2 lines per iteration */
1297 for (ui32 y = 0; y < height; y += 2)
1298 {
1299 e_val_vec[n_loop] = prev_e_val_vec;
1300 /* lcxp[0] = (ui8)((rho[0] & 8) >> 3); */
1301 tmp = _mm256_and_si256(prev_cx_val_vec, _mm256_set1_epi32(8));
1302 cx_val_vec[n_loop] = _mm256_srli_epi32(tmp, 3);
1303
1304 prev_e_val_vec = ZERO;
1305 prev_cx_val_vec = ZERO;
1306
1307 ui32 *sp = buf + y * stride;
1308
1309 if (y == 0)
1310 encode_x_loop<1>(sp, stride, height, y, n_loop, _width,
1311 ignore, p, mel, vlc, ms,
1312 e_val_vec, prev_e_val_vec,
1313 cx_val_vec, prev_cx_val_vec, prev_cq,
1314 right_shift, left_shift);
1315 else
1316 encode_x_loop<2>(sp, stride, height, y, n_loop, _width,
1317 ignore, p, mel, vlc, ms,
1318 e_val_vec, prev_e_val_vec,
1319 cx_val_vec, prev_cx_val_vec, prev_cq,
1320 right_shift, left_shift);
1321
1322 tmp = _mm256_permutevar8x32_epi32(cx_val_vec[0], right_shift);
1323 tmp = _mm256_slli_epi32(tmp, 2);
1324 tmp = _mm256_add_epi32(tmp, cx_val_vec[0]);
1325 prev_cq = (ui32)_mm_cvtsi128_si32(_mm256_castsi256_si128(tmp));
1326 }
1327
1328 ms_terminate(&ms);
1329 vlc_drain(&vlc);
1330 terminate_mel_vlc(&mel, &vlc);
1331
1332 //copy to elastic
1333 lengths[0] = mel.pos + vlc.pos + ms.pos;
1334 elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
1335 memcpy(coded->buf, ms.buf, ms.pos);
1336 memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
1337 memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
1338
1339 // put in the interface locator word
1340 ui32 num_bytes = mel.pos + vlc.pos;
1341 coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
1342 coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
1343 coded->buf[lengths[0]-2] =
1344 (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
1345
1346 coded->avail_size -= lengths[0];
1347}
1348
1349} /* namespace local */
1350} /* namespace ojph */
1351
1352#endif
1353#endif // !defined(__apple_build_version__)
void get_buffer(ui32 needed_bytes, coded_lists *&p)
Definition ojph_mem.cpp:113
static bool uvlc_init_tables()
Initializes uvlc_tbl0 and uvlc_tbl1 tables.
static bool vlc_init_tables()
Initializes vlc_tbl0 and vlc_tbl1 tables, from table0.h and table1.h.
ui16 vlc_tbl1[1024]
vlc_tbl1 contains decoding information for non-initial row of quads
ui16 vlc_tbl0[1024]
vlc_tbl0 contains decoding information for initial row of quads
static void ms_terminate(ms_struct *msp)
static void vlc_encode(vlc_struct *vlcp, int cwd, int cwd_len)
static void terminate_mel_vlc(mel_struct *melp, vlc_struct *vlcp)
void ojph_encode_codeblock_avx2(ui32 *buf, ui32 missing_msbs, ui32 num_passes, ui32 width, ui32 height, ui32 stride, ui32 *lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *&coded)
static void mel_init(dec_mel_st *melp, ui8 *bbuf, int lcup, int scup)
Initiates a dec_mel_st structure for MEL decoding and reads some bytes in order to get the read addre...
static void ms_init(ms_struct *msp, ui32 buffer_size, ui8 *data)
static void ms_encode(ms_struct *msp, ui32 cwd, int cwd_len)
static void mel_encode(mel_struct *melp, bool bit)
bool initialize_block_encoder_tables_avx2()
static void vlc_init(vlc_struct *vlcp, ui32 buffer_size, ui8 *data)
static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]
uint64_t ui64
Definition ojph_defs.h:56
uint16_t ui16
Definition ojph_defs.h:52
static ui32 population_count(ui32 val)
Definition ojph_arch.h:168
static ui32 count_trailing_zeros(ui32 val)
Definition ojph_arch.h:252
int32_t si32
Definition ojph_defs.h:55
uint32_t ui32
Definition ojph_defs.h:54
uint8_t ui8
Definition ojph_defs.h:50
#define OJPH_FORCE_INLINE
Definition ojph_arch.h:74
#define ojph_max(a, b)
Definition ojph_defs.h:73
#define ojph_min(a, b)
Definition ojph_defs.h:76
#define ojph_unused(x)
Definition ojph_defs.h:78
#define OJPH_ERROR(t,...)