709 if constexpr (modulus.data[3] >= 0x4000000000000000ULL) {
710 return montgomery_mul_big(*
this);
712#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
713 uint64_t carry_hi = 0;
715 auto [t0, carry_lo] = mul_wide(
data[0],
data[0]);
716 uint64_t t1 = square_accumulate(0,
data[1],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
717 uint64_t t2 = square_accumulate(0,
data[2],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
718 uint64_t t3 = square_accumulate(0,
data[3],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
720 uint64_t round_carry = carry_lo;
721 uint64_t k = t0 * T::r_inv;
722 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
723 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
724 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
725 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
726 t3 = carry_lo + round_carry;
728 t1 = mac_mini(t1,
data[1],
data[1], carry_lo);
730 t2 = square_accumulate(t2,
data[2],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
731 t3 = square_accumulate(t3,
data[3],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
732 round_carry = carry_lo;
734 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
735 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
736 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
737 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
738 t3 = carry_lo + round_carry;
740 t2 = mac_mini(t2,
data[2],
data[2], carry_lo);
742 t3 = square_accumulate(t3,
data[3],
data[2], carry_lo, carry_hi, carry_lo, carry_hi);
743 round_carry = carry_lo;
745 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
746 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
747 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
748 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
749 t3 = carry_lo + round_carry;
751 t3 = mac_mini(t3,
data[3],
data[3], carry_lo);
753 round_carry = carry_lo;
754 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
755 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
756 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
757 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
758 t3 = carry_lo + round_carry;
759 return { t0, t1, t2, t3 };
762 auto left = wasm_convert(
data);
763 constexpr uint64_t mask = 0x1fffffff;
774 uint64_t temp_10 = 0;
775 uint64_t temp_11 = 0;
776 uint64_t temp_12 = 0;
777 uint64_t temp_13 = 0;
778 uint64_t temp_14 = 0;
779 uint64_t temp_15 = 0;
780 uint64_t temp_16 = 0;
783 temp_0 += left[0] * left[0];
785 acc += left[0] * left[1];
786 temp_1 += (acc << 1);
788 acc += left[0] * left[2];
789 temp_2 += left[1] * left[1];
790 temp_2 += (acc << 1);
792 acc += left[0] * left[3];
793 acc += left[1] * left[2];
794 temp_3 += (acc << 1);
796 acc += left[0] * left[4];
797 acc += left[1] * left[3];
798 temp_4 += left[2] * left[2];
799 temp_4 += (acc << 1);
801 acc += left[0] * left[5];
802 acc += left[1] * left[4];
803 acc += left[2] * left[3];
804 temp_5 += (acc << 1);
806 acc += left[0] * left[6];
807 acc += left[1] * left[5];
808 acc += left[2] * left[4];
809 temp_6 += left[3] * left[3];
810 temp_6 += (acc << 1);
812 acc += left[0] * left[7];
813 acc += left[1] * left[6];
814 acc += left[2] * left[5];
815 acc += left[3] * left[4];
816 temp_7 += (acc << 1);
818 acc += left[0] * left[8];
819 acc += left[1] * left[7];
820 acc += left[2] * left[6];
821 acc += left[3] * left[5];
822 temp_8 += left[4] * left[4];
823 temp_8 += (acc << 1);
825 acc += left[1] * left[8];
826 acc += left[2] * left[7];
827 acc += left[3] * left[6];
828 acc += left[4] * left[5];
829 temp_9 += (acc << 1);
831 acc += left[2] * left[8];
832 acc += left[3] * left[7];
833 acc += left[4] * left[6];
834 temp_10 += left[5] * left[5];
835 temp_10 += (acc << 1);
837 acc += left[3] * left[8];
838 acc += left[4] * left[7];
839 acc += left[5] * left[6];
840 temp_11 += (acc << 1);
842 acc += left[4] * left[8];
843 acc += left[5] * left[7];
844 temp_12 += left[6] * left[6];
845 temp_12 += (acc << 1);
847 acc += left[5] * left[8];
848 acc += left[6] * left[7];
849 temp_13 += (acc << 1);
851 acc += left[6] * left[8];
852 temp_14 += left[7] * left[7];
853 temp_14 += (acc << 1);
855 acc += left[7] * left[8];
856 temp_15 += (acc << 1);
857 temp_16 += left[8] * left[8];
861 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
862 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
863 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
864 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
865 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
866 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
867 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
868 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
886 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
904 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
905 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
906 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
907 (temp_15 >> 18) | (temp_16 << 11) };