                                   unsigned int framebits,
                                   unsigned int excess,
                                   unsigned char* Branchtab)
{
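    /* Each iteration of the main loop decodes two trellis stages of the
       64-state decoder: the first butterfly pass reads the path metrics from X
       and writes updated metrics to Y, the second pass reads Y and writes back
       to X, and each pass ends by renormalizing the metrics it produced. */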
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
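        /* (loads elided in this listing) s18/s19 receive the old path metrics
           from X, a78/a84 the first Branchtab vectors, and a75/a81 the two
           input symbols for this stage, which are broadcast below. */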
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(a81);
        a85 = _mm_xor_si128(a82, a84);
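        /* Branch metrics: XORing each broadcast symbol with its Branchtab row
           gives per-state distances; averaging the two and shifting right by 2
           leaves a 6-bit metric t14, with t15 = 63 - t14 for the opposite branch. */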
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t14);
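        /* Add-compare-select: each successor state picks the cheaper of its two
           candidate metrics; d9/d10 flag which predecessor survived. */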
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
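        /* s20/s21 pack 32 of this stage's decision bits; they are written into
           the decision array through a91 (stores elided in this listing). */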
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
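        /* s22/s23 interleave the survivor metrics into new-state order (stores
           to Y elided); the butterfly is then repeated for the remaining 32
           states with the next pair of Branchtab vectors. */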
        a101 = _mm_xor_si128(a76, a100);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
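        /* Renormalize Y: find the minimum metric over all 64 states and
           subtract it from every state so the unsigned 8-bit metrics cannot
           saturate. */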
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
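        /* Second stage of this iteration: the same butterfly with the roles of
           X and Y exchanged (metrics are read from Y and written to X). */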
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a192 = _mm_xor_si128(a189, a191);
        a195 = _mm_set1_epi8(a194);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        a214 = _mm_xor_si128(a189, a213);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
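        /* Renormalize X exactly as Y was renormalized above. */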
        m12 = ((__m128i*)X)[0];
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
        m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
        m14 = _mm_unpacklo_epi8(m14, m14);
        m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
        m13 = _mm_unpacklo_epi64(m14, m14);
        ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
        ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
        ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
        ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
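    /* If framebits + excess is odd, one trellis stage is left over; it is
       handled by a non-vector butterfly over the 32 state pairs, applied at
       the leftover stage index (the call inside this loop is elided in this
       listing). */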
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        for (i = 0; i < 64 / 2; i++) {
                 (((framebits + excess) >> 1) << 1) + j,
                                   unsigned int framebits,
                                   unsigned int excess,
                                   unsigned char* Branchtab)
{
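    /* The kernel body below is byte-for-byte identical to the one above;
       the original source apparently carries it as a second SIMD variant of
       the same spiral-generated decoder. See the comments in the previous
       kernel for a walkthrough of each step. */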
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a79 = _mm_xor_si128(a76, a78);
        a82 = _mm_set1_epi8(a81);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a101 = _mm_xor_si128(a76, a100);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        m5 = ((__m128i*)Y)[0];
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
        m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
        m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
        m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
        m7 = _mm_unpacklo_epi8(m7, m7);
        m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
        m6 = _mm_unpacklo_epi64(m7, m7);
        ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
        ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
        ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
        ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        unsigned char a188, a194;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a192 = _mm_xor_si128(a189, a191);
        a195 = _mm_set1_epi8(a194);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        a214 = _mm_xor_si128(a189, a213);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220, _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63), t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        m12 = ((__m128i*)X)[0];
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
        m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
        m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)), ((__m128i)m14)));
        m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)), ((__m128i)m14)));
        m14 = _mm_unpacklo_epi8(m14, m14);
        m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
        m13 = _mm_unpacklo_epi64(m14, m14);
        ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
        ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
        ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
        ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
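    /* Scalar tail for a leftover odd stage, as in the previous kernel. */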
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        for (i = 0; i < 64 / 2; i++) {
                 (((framebits + excess) >> 1) << 1) + j,