diff --git a/Backport-JDK-8319716-8327283-RISC-V-Add-SHA-2.patch b/Backport-JDK-8319716-8327283-RISC-V-Add-SHA-2.patch new file mode 100644 index 0000000000000000000000000000000000000000..16763dc8a0b363c01d888167f340e636207195cf --- /dev/null +++ b/Backport-JDK-8319716-8327283-RISC-V-Add-SHA-2.patch @@ -0,0 +1,920 @@ +diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp +index 24de7c15f..24e5f4fa8 100644 +--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/assembler_riscv.hpp +@@ -1312,6 +1312,7 @@ enum VectorMask { + INSN(vsll_vi, 0b1010111, 0b011, 0b100101); + + // Vector Slide Instructions ++ INSN(vslideup_vi, 0b1010111, 0b011, 0b001110); + INSN(vslidedown_vi, 0b1010111, 0b011, 0b001111); + + #undef INSN +@@ -1666,7 +1667,6 @@ enum VectorMask { + INSN(vmv_v_x, 0b1010111, 0b100, v0, 0b1, 0b010111); + + #undef INSN +-#undef patch_VArith + + #define INSN(NAME, op, funct13, funct6) \ + void NAME(VectorRegister Vd, VectorMask vm = unmasked) { \ +@@ -1708,14 +1708,29 @@ enum Nf { + patch_reg((address)&insn, 15, Rs1); \ + emit(insn) + +-#define INSN(NAME, op, lumop, vm, mop, nf) \ +- void NAME(VectorRegister Vd, Register Rs1, uint32_t width = 0, bool mew = false) { \ ++#define INSN(NAME, op, width, lumop, vm, mop, mew, nf) \ ++ void NAME(VectorRegister Vd, Register Rs1) { \ + guarantee(is_uimm3(width), "width is invalid"); \ + patch_VLdSt(op, Vd, width, Rs1, lumop, vm, mop, mew, nf); \ + } + + // Vector Load/Store Instructions +- INSN(vl1re8_v, 0b0000111, 0b01000, 0b1, 0b00, g1); ++ INSN(vl1re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1); ++ INSN(vl1re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g1); ++ INSN(vl1re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g1); ++ INSN(vl1re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g1); ++ INSN(vl2re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2); ++ INSN(vl2re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g2); ++ INSN(vl2re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g2); ++ INSN(vl2re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g2); ++ INSN(vl4re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4); ++ INSN(vl4re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g4); ++ INSN(vl4re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g4); ++ INSN(vl4re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g4); ++ INSN(vl8re8_v, 0b0000111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8); ++ INSN(vl8re16_v, 0b0000111, 0b101, 0b01000, 0b1, 0b00, 0b0, g8); ++ INSN(vl8re32_v, 0b0000111, 0b110, 0b01000, 0b1, 0b00, 0b0, g8); ++ INSN(vl8re64_v, 0b0000111, 0b111, 0b01000, 0b1, 0b00, 0b0, g8); + + #undef INSN + +@@ -1726,6 +1741,9 @@ enum Nf { + + // Vector Load/Store Instructions + INSN(vs1r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g1); ++ INSN(vs2r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g2); ++ INSN(vs4r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g4); ++ INSN(vs8r_v, 0b0100111, 0b000, 0b01000, 0b1, 0b00, 0b0, g8); + + #undef INSN + +@@ -1771,10 +1789,12 @@ enum Nf { + } + + // Vector unordered indexed load instructions ++ INSN( vluxei8_v, 0b0000111, 0b000, 0b01, 0b0); + INSN(vluxei32_v, 0b0000111, 0b110, 0b01, 0b0); + INSN(vluxei64_v, 0b0000111, 0b111, 0b01, 0b0); + + // Vector unordered indexed store instructions ++ INSN( vsuxei8_v, 0b0100111, 0b000, 0b01, 0b0); + INSN(vsuxei32_v, 0b0100111, 0b110, 0b01, 0b0); + INSN(vsuxei64_v, 0b0100111, 0b111, 0b01, 0b0); + +@@ -1794,6 +1814,55 @@ enum Nf { + #undef INSN + #undef patch_VLdSt + ++// ==================================== 
++// RISC-V Vector Crypto Extension ++// ==================================== ++ ++#define INSN(NAME, op, funct3, funct6) \ ++ void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1, VectorMask vm = unmasked) { \ ++ patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6); \ ++ } ++ ++ // Vector Bit-manipulation used in Cryptography (Zvkb) Extension ++ INSN(vandn_vv, 0b1010111, 0b000, 0b000001); ++ INSN(vandn_vx, 0b1010111, 0b100, 0b000001); ++ INSN(vandn_vi, 0b1010111, 0b011, 0b000001); ++ INSN(vclmul_vv, 0b1010111, 0b010, 0b001100); ++ INSN(vclmul_vx, 0b1010111, 0b110, 0b001100); ++ INSN(vclmulh_vv, 0b1010111, 0b010, 0b001101); ++ INSN(vclmulh_vx, 0b1010111, 0b110, 0b001101); ++ INSN(vror_vv, 0b1010111, 0b000, 0b010100); ++ INSN(vror_vx, 0b1010111, 0b100, 0b010100); ++ INSN(vrol_vv, 0b1010111, 0b000, 0b010101); ++ INSN(vrol_vx, 0b1010111, 0b100, 0b010101); ++ ++#undef INSN ++ ++#define INSN(NAME, op, funct3, Vs1, funct6) \ ++ void NAME(VectorRegister Vd, VectorRegister Vs2, VectorMask vm = unmasked) { \ ++ patch_VArith(op, Vd, funct3, Vs1, Vs2, vm, funct6); \ ++ } ++ ++ // Vector Bit-manipulation used in Cryptography (Zvkb) Extension ++ INSN(vbrev8_v, 0b1010111, 0b010, 0b01000, 0b010010); ++ INSN(vrev8_v, 0b1010111, 0b010, 0b01001, 0b010010); ++ ++#undef INSN ++ ++#define INSN(NAME, op, funct3, vm, funct6) \ ++ void NAME(VectorRegister Vd, VectorRegister Vs2, VectorRegister Vs1) { \ ++ patch_VArith(op, Vd, funct3, Vs1->raw_encoding(), Vs2, vm, funct6); \ ++ } ++ ++ // Vector SHA-2 Secure Hash (Zvknh[ab]) Extension ++ INSN(vsha2ms_vv, 0b1110111, 0b010, 0b1, 0b101101); ++ INSN(vsha2ch_vv, 0b1110111, 0b010, 0b1, 0b101110); ++ INSN(vsha2cl_vv, 0b1110111, 0b010, 0b1, 0b101111); ++ ++#undef INSN ++ ++#undef patch_VArith ++ + // ==================================== + // RISC-V Bit-Manipulation Extension + // Currently only support Zba, Zbb and Zbs bitmanip extensions. 
+diff --git a/src/hotspot/cpu/riscv/globals_riscv.hpp b/src/hotspot/cpu/riscv/globals_riscv.hpp +index 2ee0b4b94..4d39d9905 100644 +--- a/src/hotspot/cpu/riscv/globals_riscv.hpp ++++ b/src/hotspot/cpu/riscv/globals_riscv.hpp +@@ -112,6 +112,8 @@ define_pd_global(intx, InlineSmallCode, 1000); + product(bool, UseZicboz, false, EXPERIMENTAL, "Use Zicboz instructions") \ + product(bool, UseZihintpause, false, EXPERIMENTAL, \ + "Use Zihintpause instructions") \ ++ product(bool, UseZvkn, false, EXPERIMENTAL, \ ++ "Use Zvkn group extension, Zvkned, Zvknhb, Zvkb, Zvkt") \ + product(bool, UseRVVForBigIntegerShiftIntrinsics, true, \ + "Use RVV instructions for left/right shift of BigInteger") + +diff --git a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +index 08c953b2c..77cc5e056 100644 +--- a/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp ++++ b/src/hotspot/cpu/riscv/macroAssembler_riscv.hpp +@@ -1331,6 +1331,16 @@ public: + vmfle_vv(vd, vs1, vs2, vm); + } + ++ inline void vmsltu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) { ++ guarantee(imm >= 1 && imm <= 16, "imm is invalid"); ++ vmsleu_vi(Vd, Vs2, imm-1, vm); ++ } ++ ++ inline void vmsgeu_vi(VectorRegister Vd, VectorRegister Vs2, uint32_t imm, VectorMask vm = unmasked) { ++ guarantee(imm >= 1 && imm <= 16, "imm is invalid"); ++ vmsgtu_vi(Vd, Vs2, imm-1, vm); ++ } ++ + // Copy mask register + inline void vmmv_m(VectorRegister vd, VectorRegister vs) { + vmand_mm(vd, vs, vs); +@@ -1346,6 +1356,10 @@ public: + vmxnor_mm(vd, vd, vd); + } + ++ inline void vnot_v(VectorRegister Vd, VectorRegister Vs, VectorMask vm = unmasked) { ++ vxor_vi(Vd, Vs, -1, vm); ++ } ++ + static const int zero_words_block_size; + + void cast_primitive_type(BasicType type, Register Rt) { +diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +index 8c5e1c097..dec9a8464 100644 +--- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp ++++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +@@ -3715,118 +3715,8 @@ class StubGenerator: public StubCodeGenerator { + return entry; + } + }; +-#endif // COMPILER2 +- +- // Continuation point for throwing of implicit exceptions that are +- // not handled in the current activation. Fabricates an exception +- // oop and initiates normal exception dispatching in this +- // frame. Since we need to preserve callee-saved values (currently +- // only for C2, but done for C1 as well) we need a callee-saved oop +- // map and therefore have to make these stubs into RuntimeStubs +- // rather than BufferBlobs. If the compiler needs all registers to +- // be preserved between the fault point and the exception handler +- // then it must assume responsibility for that in +- // AbstractCompiler::continuation_for_implicit_null_exception or +- // continuation_for_implicit_division_by_zero_exception. All other +- // implicit exceptions (e.g., NullPointerException or +- // AbstractMethodError on entry) are either at call sites or +- // otherwise assume that stack unwinding will be initiated, so +- // caller saved registers were assumed volatile in the compiler. +- +-#undef __ +-#define __ masm-> +- +- address generate_throw_exception(const char* name, +- address runtime_entry, +- Register arg1 = noreg, +- Register arg2 = noreg) { +- // Information about frame layout at time of blocking runtime call. 
+- // Note that we only have to preserve callee-saved registers since +- // the compilers are responsible for supplying a continuation point +- // if they expect all registers to be preserved. +- // n.b. riscv asserts that frame::arg_reg_save_area_bytes == 0 +- assert_cond(runtime_entry != nullptr); +- enum layout { +- fp_off = 0, +- fp_off2, +- return_off, +- return_off2, +- framesize // inclusive of return address +- }; +- +- const int insts_size = 1024; +- const int locs_size = 64; +- +- CodeBuffer code(name, insts_size, locs_size); +- OopMapSet* oop_maps = new OopMapSet(); +- MacroAssembler* masm = new MacroAssembler(&code); +- assert_cond(oop_maps != nullptr && masm != nullptr); +- +- address start = __ pc(); +- +- // This is an inlined and slightly modified version of call_VM +- // which has the ability to fetch the return PC out of +- // thread-local storage and also sets up last_Java_sp slightly +- // differently than the real call_VM +- +- __ enter(); // Save FP and RA before call +- +- assert(is_even(framesize / 2), "sp not 16-byte aligned"); +- +- // ra and fp are already in place +- __ addi(sp, fp, 0 - ((unsigned)framesize << LogBytesPerInt)); // prolog +- +- int frame_complete = __ pc() - start; +- +- // Set up last_Java_sp and last_Java_fp +- address the_pc = __ pc(); +- __ set_last_Java_frame(sp, fp, the_pc, t0); +- +- // Call runtime +- if (arg1 != noreg) { +- assert(arg2 != c_rarg1, "clobbered"); +- __ mv(c_rarg1, arg1); +- } +- if (arg2 != noreg) { +- __ mv(c_rarg2, arg2); +- } +- __ mv(c_rarg0, xthread); +- BLOCK_COMMENT("call runtime_entry"); +- __ call(runtime_entry); + +- // Generate oop map +- OopMap* map = new OopMap(framesize, 0); +- assert_cond(map != nullptr); +- +- oop_maps->add_gc_map(the_pc - start, map); +- +- __ reset_last_Java_frame(true); +- +- __ leave(); +- +- // check for pending exceptions +-#ifdef ASSERT +- Label L; +- __ ld(t0, Address(xthread, Thread::pending_exception_offset())); +- __ bnez(t0, L); +- __ should_not_reach_here(); +- __ bind(L); +-#endif // ASSERT +- __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); +- +- // codeBlob framesize is in words (not VMRegImpl::slot_size) +- RuntimeStub* stub = +- RuntimeStub::new_runtime_stub(name, +- &code, +- frame_complete, +- (framesize >> (LogBytesPerWord - LogBytesPerInt)), +- oop_maps, false); +- assert(stub != nullptr, "create runtime stub fail!"); +- return stub->entry_point(); +- } +- +-#undef __ +-#define __ _masm-> ++#endif // COMPILER2 + + address generate_cont_thaw(Continuation::thaw_kind kind) { + bool return_barrier = Continuation::is_thaw_return_barrier(kind); +@@ -3970,6 +3860,395 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++#if COMPILER2_OR_JVMCI ++ ++#undef __ ++#define __ this-> ++ ++ class Sha2Generator : public MacroAssembler { ++ StubCodeGenerator* _cgen; ++ public: ++ Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {} ++ address generate_sha256_implCompress(bool multi_block) { ++ return generate_sha2_implCompress(Assembler::e32, multi_block); ++ } ++ address generate_sha512_implCompress(bool multi_block) { ++ return generate_sha2_implCompress(Assembler::e64, multi_block); ++ } ++ private: ++ ++ void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { ++ if (vset_sew == Assembler::e32) __ vle32_v(vr, sr); ++ else __ vle64_v(vr, sr); ++ } ++ ++ void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) { ++ if (vset_sew == Assembler::e32) __ vse32_v(vr, 
sr); ++ else __ vse64_v(vr, sr); ++ } ++ ++ // Overview of the logic in each "quad round". ++ // ++ // The code below repeats 16/20 times the logic implementing four rounds ++ // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds" ++ // to implementing the 64/80 single rounds. ++ // ++ // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0]) ++ // // Output: ++ // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} ++ // vl1reXX.v vTmp1, ofs ++ // ++ // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b) ++ // addi ofs, ofs, 16/32 ++ // ++ // // Add constants to message schedule words: ++ // // Input ++ // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]} ++ // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0]; ++ // // Output ++ // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} ++ // vadd.vv vTmp0, vTmp1, vW0 ++ // ++ // // 2 rounds of working variables updates. ++ // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t] ++ // // Input: ++ // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] " ++ // // vState0 = {a[t],b[t],e[t],f[t]} ++ // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} ++ // // Output: ++ // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " ++ // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] " ++ // vsha2cl.vv vState1, vState0, vTmp0 ++ // ++ // // 2 rounds of working variables updates. ++ // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t] ++ // // Input ++ // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] " ++ // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] " ++ // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] " ++ // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]} ++ // // Output: ++ // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] " ++ // vsha2ch.vv vState0, vState1, vTmp0 ++ // ++ // // Combine 2QW into 1QW ++ // // ++ // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs ++ // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3] ++ // // and it can only take 3 vectors as inputs. Hence we need to combine ++ // // vW1[0] and vW2[1..3] in a single vector. 
++ // // ++ // // vmerge Vt4, Vt1, Vt2, V0 ++ // // Input ++ // // V0 = mask // first word from vW2, 1..3 words from vW1 ++ // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5} ++ // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9} ++ // // Output ++ // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5} ++ // vmerge.vvm vTmp0, vW2, vW1, v0 ++ // ++ // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds) ++ // // Input ++ // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0] ++ // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12] ++ // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4] ++ // // Output (next four message schedule words) ++ // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16] ++ // vsha2ms.vv vW0, vTmp0, vW3 ++ // ++ // BEFORE ++ // vW0 - vW3 hold the message schedule words (initially the block words) ++ // vW0 = W[ 3: 0] "oldest" ++ // vW1 = W[ 7: 4] ++ // vW2 = W[11: 8] ++ // vW3 = W[15:12] "newest" ++ // ++ // vt6 - vt7 hold the working state variables ++ // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0} ++ // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2} ++ // ++ // AFTER ++ // vW0 - vW3 hold the message schedule words (initially the block words) ++ // vW1 = W[ 7: 4] "oldest" ++ // vW2 = W[11: 8] ++ // vW3 = W[15:12] ++ // vW0 = W[19:16] "newest" ++ // ++ // vState0 and vState1 hold the working state variables ++ // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]} ++ // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]} ++ // ++ // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round, ++ // hence the uses of those vectors rotate in each round, and we get back to the ++ // initial configuration every 4 quad-rounds. We could avoid those changes at ++ // the cost of moving those vectors at the end of each quad-rounds. ++ void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4, ++ Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh, ++ bool gen_words = true, bool step_const = true) { ++ __ vleXX_v(vset_sew, vtemp, scalarconst); ++ if (step_const) { ++ __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 
16 : 32); ++ } ++ __ vadd_vv(vtemp2, vtemp, rot1); ++ __ vsha2cl_vv(v_cdgh, v_abef, vtemp2); ++ __ vsha2ch_vv(v_abef, v_cdgh, vtemp2); ++ if (gen_words) { ++ __ vmerge_vvm(vtemp2, rot3, rot2); ++ __ vsha2ms_vv(rot1, vtemp2, rot4); ++ } ++ } ++ ++ const char* stub_name(Assembler::SEW vset_sew, bool multi_block) { ++ if (vset_sew == Assembler::e32 && !multi_block) return "sha256_implCompress"; ++ if (vset_sew == Assembler::e32 && multi_block) return "sha256_implCompressMB"; ++ if (vset_sew == Assembler::e64 && !multi_block) return "sha512_implCompress"; ++ if (vset_sew == Assembler::e64 && multi_block) return "sha512_implCompressMB"; ++ ShouldNotReachHere(); ++ return "bad name lookup"; ++ } ++ ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - byte[] source+offset ++ // c_rarg1 - int[] SHA.state ++ // c_rarg2 - int offset ++ // c_rarg3 - int limit ++ // ++ address generate_sha2_implCompress(Assembler::SEW vset_sew, bool multi_block) { ++ alignas(64) static const uint32_t round_consts_256[64] = { ++ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, ++ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, ++ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, ++ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, ++ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, ++ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, ++ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, ++ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, ++ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, ++ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, ++ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, ++ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, ++ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, ++ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, ++ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, ++ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, ++ }; ++ alignas(64) static const uint64_t round_consts_512[80] = { ++ 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl, ++ 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l, ++ 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l, ++ 0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l, ++ 0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l, ++ 0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l, ++ 0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l, ++ 0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l, ++ 0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl, ++ 0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l, ++ 0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl, ++ 0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl, ++ 0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l, ++ 0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l, ++ 0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l, ++ 0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l, ++ 0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l, ++ 0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl, ++ 0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl, ++ 0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl, ++ 0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l, ++ 0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l, ++ 0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal, ++ 0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl, ++ 0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl, ++ 
0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al, ++ 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l ++ }; ++ const int const_add = vset_sew == Assembler::e32 ? 16 : 32; ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(_cgen, "StubRoutines", stub_name(vset_sew, multi_block)); ++ address start = __ pc(); ++ ++ Register buf = c_rarg0; ++ Register state = c_rarg1; ++ Register ofs = c_rarg2; ++ Register limit = c_rarg3; ++ Register consts = t2; // caller saved ++ Register state_c = x28; // caller saved ++ VectorRegister vindex = v2; ++ VectorRegister vW0 = v4; ++ VectorRegister vW1 = v6; ++ VectorRegister vW2 = v8; ++ VectorRegister vW3 = v10; ++ VectorRegister vState0 = v12; ++ VectorRegister vState1 = v14; ++ VectorRegister vHash0 = v16; ++ VectorRegister vHash1 = v18; ++ VectorRegister vTmp0 = v20; ++ VectorRegister vTmp1 = v22; ++ ++ Label multi_block_loop; ++ ++ __ enter(); ++ ++ address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512; ++ la(consts, ExternalAddress(constant_table)); ++ ++ // Register use in this function: ++ // ++ // VECTORS ++ // vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/65 bits), hold the message ++ // schedule words (Wt). They start with the message block ++ // content (W0 to W15), then further words in the message ++ // schedule generated via vsha2ms from previous Wt. ++ // Initially: ++ // vW0 = W[ 3:0] = { W3, W2, W1, W0} ++ // vW1 = W[ 7:4] = { W7, W6, W5, W4} ++ // vW2 = W[ 11:8] = {W11, W10, W9, W8} ++ // vW3 = W[15:12] = {W15, W14, W13, W12} ++ // ++ // vState0 - vState1 hold the working state variables (a, b, ..., h) ++ // vState0 = {f[t],e[t],b[t],a[t]} ++ // vState1 = {h[t],g[t],d[t],c[t]} ++ // Initially: ++ // vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1} ++ // vState1 = {H7i-i, H6i-1, H3i-1 , H2i-1} ++ // ++ // v0 = masks for vrgather/vmerge. Single value during the 16 rounds. ++ // ++ // vTmp0 = temporary, Wt+Kt ++ // vTmp1 = temporary, Kt ++ // ++ // vHash0/vHash1 = hold the initial values of the hash, byte-swapped. ++ // ++ // During most of the function the vector state is configured so that each ++ // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits). ++ ++ // vsha2ch/vsha2cl uses EGW of 4*SEW. ++ // SHA256 SEW = e32, EGW = 128-bits ++ // SHA512 SEW = e64, EGW = 256-bits ++ // ++ // VLEN is required to be at least 128. ++ // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256) ++ // ++ // m1: LMUL=1/2 ++ // ta: tail agnostic (don't care about those lanes) ++ // ma: mask agnostic (don't care about those lanes) ++ // x0 is not written, we known the number of vector elements. ++ ++ if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128 ++ __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta); ++ } else { ++ __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta); ++ } ++ ++ int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul; ++ __ li(t0, indexes); ++ __ vmv_v_x(vindex, t0); ++ ++ // Step-over a,b, so we are pointing to c. ++ // const_add is equal to 4x state variable, div by 2 is thus 2, a,b ++ __ addi(state_c, state, const_add/2); ++ ++ // Use index-load to get {f,e,b,a},{h,g,d,c} ++ __ vluxei8_v(vState0, state, vindex); ++ __ vluxei8_v(vState1, state_c, vindex); ++ ++ __ bind(multi_block_loop); ++ ++ // Capture the initial H values in vHash0 and vHash1 to allow for computing ++ // the resulting H', since H' = H+{a',b',c',...,h'}. 
++ __ vmv_v_v(vHash0, vState0); ++ __ vmv_v_v(vHash1, vState1); ++ ++ // Load the 512/1024-bits of the message block in vW0-vW3 and perform ++ // an endian swap on each 4/8 bytes element. ++ // ++ // If Zvkb is not implemented one can use vrgather ++ // with an index sequence to byte-swap. ++ // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12] ++ // gives us "N ^ 3" as a nice formula to generate ++ // this sequence. 'vid' gives us the N. ++ __ vleXX_v(vset_sew, vW0, buf); ++ __ vrev8_v(vW0, vW0); ++ __ addi(buf, buf, const_add); ++ __ vleXX_v(vset_sew, vW1, buf); ++ __ vrev8_v(vW1, vW1); ++ __ addi(buf, buf, const_add); ++ __ vleXX_v(vset_sew, vW2, buf); ++ __ vrev8_v(vW2, vW2); ++ __ addi(buf, buf, const_add); ++ __ vleXX_v(vset_sew, vW3, buf); ++ __ vrev8_v(vW3, vW3); ++ __ addi(buf, buf, const_add); ++ ++ // Set v0 up for the vmerge that replaces the first word (idx==0) ++ __ vid_v(v0); ++ __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0) ++ ++ VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3}; ++ int rot_pos = 0; ++ // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2) ++ const int qr_end = vset_sew == Assembler::e32 ? 12 : 16; ++ for (int i = 0; i < qr_end; i++) { ++ sha2_quad_round(vset_sew, ++ rotation_regs[(rot_pos + 0) & 0x3], ++ rotation_regs[(rot_pos + 1) & 0x3], ++ rotation_regs[(rot_pos + 2) & 0x3], ++ rotation_regs[(rot_pos + 3) & 0x3], ++ consts, ++ vTmp1, vTmp0, vState0, vState1); ++ ++rot_pos; ++ } ++ // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2) ++ // Note that we stop generating new message schedule words (Wt, vW0-13) ++ // as we already generated all the words we end up consuming (i.e., W[63:60]). ++ const int qr_c_end = qr_end + 4; ++ for (int i = qr_end; i < qr_c_end; i++) { ++ sha2_quad_round(vset_sew, ++ rotation_regs[(rot_pos + 0) & 0x3], ++ rotation_regs[(rot_pos + 1) & 0x3], ++ rotation_regs[(rot_pos + 2) & 0x3], ++ rotation_regs[(rot_pos + 3) & 0x3], ++ consts, ++ vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1)); ++ ++rot_pos; ++ } ++ ++ //-------------------------------------------------------------------------------- ++ // Compute the updated hash value H' ++ // H' = H + {h',g',...,b',a'} ++ // = {h,g,...,b,a} + {h',g',...,b',a'} ++ // = {h+h',g+g',...,b+b',a+a'} ++ ++ // H' = H+{a',b',c',...,h'} ++ __ vadd_vv(vState0, vHash0, vState0); ++ __ vadd_vv(vState1, vHash1, vState1); ++ ++ if (multi_block) { ++ int total_adds = vset_sew == Assembler::e32 ? 240 : 608; ++ __ addi(consts, consts, -total_adds); ++ __ add(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128); ++ __ ble(ofs, limit, multi_block_loop); ++ __ mv(c_rarg0, ofs); // return ofs ++ } ++ ++ // Store H[0..8] = {a,b,c,d,e,f,g,h} from ++ // vState0 = {f,e,b,a} ++ // vState1 = {h,g,d,c} ++ __ vsuxei8_v(vState0, state, vindex); ++ __ vsuxei8_v(vState1, state_c, vindex); ++ ++ __ leave(); ++ __ ret(); ++ ++ return start; ++ } ++ }; ++ ++#undef __ ++#define __ _masm-> ++ + // Set of L registers that correspond to a contiguous memory area. + // Each 64-bit register typically corresponds to 2 32-bit integers. 
+ template +@@ -4339,6 +4618,7 @@ class StubGenerator: public StubCodeGenerator { + return (address) start; + } + ++#endif // COMPILER2_OR_JVMCI + #if INCLUDE_JFR + + static void jfr_prologue(address the_pc, MacroAssembler* _masm, Register thread) { +@@ -4430,6 +4710,115 @@ class StubGenerator: public StubCodeGenerator { + + #endif // INCLUDE_JFR + ++ ++ // Continuation point for throwing of implicit exceptions that are ++ // not handled in the current activation. Fabricates an exception ++ // oop and initiates normal exception dispatching in this ++ // frame. Since we need to preserve callee-saved values (currently ++ // only for C2, but done for C1 as well) we need a callee-saved oop ++ // map and therefore have to make these stubs into RuntimeStubs ++ // rather than BufferBlobs. If the compiler needs all registers to ++ // be preserved between the fault point and the exception handler ++ // then it must assume responsibility for that in ++ // AbstractCompiler::continuation_for_implicit_null_exception or ++ // continuation_for_implicit_division_by_zero_exception. All other ++ // implicit exceptions (e.g., NullPointerException or ++ // AbstractMethodError on entry) are either at call sites or ++ // otherwise assume that stack unwinding will be initiated, so ++ // caller saved registers were assumed volatile in the compiler. ++ ++#undef __ ++#define __ masm-> ++ ++ address generate_throw_exception(const char* name, ++ address runtime_entry, ++ Register arg1 = noreg, ++ Register arg2 = noreg) { ++ // Information about frame layout at time of blocking runtime call. ++ // Note that we only have to preserve callee-saved registers since ++ // the compilers are responsible for supplying a continuation point ++ // if they expect all registers to be preserved. ++ // n.b. 
riscv asserts that frame::arg_reg_save_area_bytes == 0 ++ assert_cond(runtime_entry != nullptr); ++ enum layout { ++ fp_off = 0, ++ fp_off2, ++ return_off, ++ return_off2, ++ framesize // inclusive of return address ++ }; ++ ++ const int insts_size = 1024; ++ const int locs_size = 64; ++ ++ CodeBuffer code(name, insts_size, locs_size); ++ OopMapSet* oop_maps = new OopMapSet(); ++ MacroAssembler* masm = new MacroAssembler(&code); ++ assert_cond(oop_maps != nullptr && masm != nullptr); ++ ++ address start = __ pc(); ++ ++ // This is an inlined and slightly modified version of call_VM ++ // which has the ability to fetch the return PC out of ++ // thread-local storage and also sets up last_Java_sp slightly ++ // differently than the real call_VM ++ ++ __ enter(); // Save FP and RA before call ++ ++ assert(is_even(framesize / 2), "sp not 16-byte aligned"); ++ ++ // ra and fp are already in place ++ __ addi(sp, fp, 0 - ((unsigned)framesize << LogBytesPerInt)); // prolog ++ ++ int frame_complete = __ pc() - start; ++ ++ // Set up last_Java_sp and last_Java_fp ++ address the_pc = __ pc(); ++ __ set_last_Java_frame(sp, fp, the_pc, t0); ++ ++ // Call runtime ++ if (arg1 != noreg) { ++ assert(arg2 != c_rarg1, "clobbered"); ++ __ mv(c_rarg1, arg1); ++ } ++ if (arg2 != noreg) { ++ __ mv(c_rarg2, arg2); ++ } ++ __ mv(c_rarg0, xthread); ++ BLOCK_COMMENT("call runtime_entry"); ++ __ call(runtime_entry); ++ ++ // Generate oop map ++ OopMap* map = new OopMap(framesize, 0); ++ assert_cond(map != nullptr); ++ ++ oop_maps->add_gc_map(the_pc - start, map); ++ ++ __ reset_last_Java_frame(true); ++ ++ __ leave(); ++ ++ // check for pending exceptions ++#ifdef ASSERT ++ Label L; ++ __ ld(t0, Address(xthread, Thread::pending_exception_offset())); ++ __ bnez(t0, L); ++ __ should_not_reach_here(); ++ __ bind(L); ++#endif // ASSERT ++ __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); ++ ++ // codeBlob framesize is in words (not VMRegImpl::slot_size) ++ RuntimeStub* stub = ++ RuntimeStub::new_runtime_stub(name, ++ &code, ++ frame_complete, ++ (framesize >> (LogBytesPerWord - LogBytesPerInt)), ++ oop_maps, false); ++ assert(stub != nullptr, "create runtime stub fail!"); ++ return stub->entry_point(); ++ } ++ + #undef __ + + // Initialization +@@ -4550,6 +4939,18 @@ class StubGenerator: public StubCodeGenerator { + } + #endif // COMPILER2 + ++ if (UseSHA256Intrinsics) { ++ Sha2Generator sha2(_masm, this); ++ StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(false); ++ StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(true); ++ } ++ ++ if (UseSHA512Intrinsics) { ++ Sha2Generator sha2(_masm, this); ++ StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(false); ++ StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(true); ++ } ++ + generate_compare_long_strings(); + + generate_string_indexof_stubs(); +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +index a27acb25b..38da4752c 100644 +--- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +@@ -121,26 +121,11 @@ void VM_Version::initialize() { + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + +- if (UseSHA) { +- warning("SHA instructions are not available on this CPU"); +- FLAG_SET_DEFAULT(UseSHA, false); +- } +- + if (UseSHA1Intrinsics) { + warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU."); + FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); + } + 
+- if (UseSHA256Intrinsics) { +- warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU."); +- FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); +- } +- +- if (UseSHA512Intrinsics) { +- warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU."); +- FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); +- } +- + if (UseSHA3Intrinsics) { + warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU."); + FLAG_SET_DEFAULT(UseSHA3Intrinsics, false); +@@ -224,6 +209,36 @@ void VM_Version::initialize() { + #ifdef COMPILER2 + c2_initialize(); + #endif // COMPILER2 ++ ++ if (UseZvkn && !UseRVV) { ++ FLAG_SET_DEFAULT(UseZvkn, false); ++ warning("Cannot enable Zvkn on cpu without RVV support."); ++ } ++ ++ if (!UseZvkn && UseSHA) { ++ warning("SHA instructions are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseSHA, false); ++ } else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) { ++ FLAG_SET_DEFAULT(UseSHA, true); ++ } ++ ++ if (!UseSHA) { ++ if (UseSHA256Intrinsics) { ++ warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed."); ++ FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); ++ } ++ if (UseSHA512Intrinsics) { ++ warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed."); ++ FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); ++ } ++ } else { ++ if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { ++ FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); ++ } ++ if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { ++ FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); ++ } ++ } + } + + #ifdef COMPILER2 +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.hpp b/src/hotspot/cpu/riscv/vm_version_riscv.hpp +index 01c5cf0c6..590585b42 100644 +--- a/src/hotspot/cpu/riscv/vm_version_riscv.hpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.hpp +@@ -144,6 +144,7 @@ class VM_Version : public Abstract_VM_Version { + decl(ext_Zifencei , "Zifencei" , RV_NO_FLAG_BIT, true , NO_UPDATE_DEFAULT) \ + decl(ext_Zic64b , "Zic64b" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZic64b)) \ + decl(ext_Zihintpause , "Zihintpause" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZihintpause)) \ ++ decl(ext_Zvkn , "Zvkn" , RV_NO_FLAG_BIT, true , UPDATE_DEFAULT(UseZvkn)) \ + decl(mvendorid , "VendorId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \ + decl(marchid , "ArchId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \ + decl(mimpid , "ImpId" , RV_NO_FLAG_BIT, false, NO_UPDATE_DEFAULT) \ diff --git a/Backport-JDK-8322179-RISC-V-Implement-SHA-1-intrinsic.patch b/Backport-JDK-8322179-RISC-V-Implement-SHA-1-intrinsic.patch new file mode 100644 index 0000000000000000000000000000000000000000..3f4c485ebe23b4bd91bb17beda60948762c25eed --- /dev/null +++ b/Backport-JDK-8322179-RISC-V-Implement-SHA-1-intrinsic.patch @@ -0,0 +1,481 @@ +diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +index dec9a8464..a554729ab 100644 +--- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp ++++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +@@ -4618,6 +4618,344 @@ class StubGenerator: public StubCodeGenerator { + return (address) start; + } + ++ // ------------------------ SHA-1 intrinsic ------------------------ ++ ++ // K't = ++ // 5a827999, 0 <= t <= 19 ++ // 6ed9eba1, 20 <= t <= 39 ++ // 8f1bbcdc, 40 <= t <= 59 ++ // ca62c1d6, 60 <= t <= 79 ++ void sha1_prepare_k(Register cur_k, int round) { ++ assert(round >= 0 && round < 80, 
"must be"); ++ ++ static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6}; ++ if ((round % 20) == 0) { ++ __ mv(cur_k, ks[round/20]); ++ } ++ } ++ ++ // W't = ++ // M't, 0 <= t <= 15 ++ // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 ++ void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) { ++ assert(round >= 0 && round < 80, "must be"); ++ ++ if (round < 16) { ++ // in the first 16 rounds, in ws[], every register contains 2 W't, e.g. ++ // in ws[0], high part contains W't-0, low part contains W't-1, ++ // in ws[1], high part contains W't-2, low part contains W't-3, ++ // ... ++ // in ws[7], high part contains W't-14, low part contains W't-15. ++ ++ if ((round % 2) == 0) { ++ __ ld(ws[round/2], Address(buf, (round/2) * 8)); ++ // reverse bytes, as SHA-1 is defined in big-endian. ++ __ revb(ws[round/2], ws[round/2]); ++ __ srli(cur_w, ws[round/2], 32); ++ } else { ++ __ mv(cur_w, ws[round/2]); ++ } ++ ++ return; ++ } ++ ++ if ((round % 2) == 0) { ++ int idx = 16; ++ // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 ++ __ srli(t1, ws[(idx-8)/2], 32); ++ __ xorr(t0, ws[(idx-3)/2], t1); ++ ++ __ srli(t1, ws[(idx-14)/2], 32); ++ __ srli(cur_w, ws[(idx-16)/2], 32); ++ __ xorr(cur_w, cur_w, t1); ++ ++ __ xorr(cur_w, cur_w, t0); ++ __ rolw_imm(cur_w, cur_w, 1, t0); ++ ++ // copy the cur_w value to ws[8]. ++ // now, valid w't values are at: ++ // w0: ws[0]'s lower 32 bits ++ // w1 ~ w14: ws[1] ~ ws[7] ++ // w15: ws[8]'s higher 32 bits ++ __ slli(ws[idx/2], cur_w, 32); ++ ++ return; ++ } ++ ++ int idx = 17; ++ // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79 ++ __ srli(t1, ws[(idx-3)/2], 32); ++ __ xorr(t0, t1, ws[(idx-8)/2]); ++ ++ __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]); ++ ++ __ xorr(cur_w, cur_w, t0); ++ __ rolw_imm(cur_w, cur_w, 1, t0); ++ ++ // copy the cur_w value to ws[8] ++ __ zero_extend(cur_w, cur_w, 32); ++ __ orr(ws[idx/2], ws[idx/2], cur_w); ++ ++ // shift the w't registers, so they start from ws[0] again. 
++ // now, valid w't values are at: ++ // w0 ~ w15: ws[0] ~ ws[7] ++ Register ws_0 = ws[0]; ++ for (int i = 0; i < 16/2; i++) { ++ ws[i] = ws[i+1]; ++ } ++ ws[8] = ws_0; ++ } ++ ++ // f't(x, y, z) = ++ // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19 ++ // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39 ++ // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59 ++ // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79 ++ void sha1_f(Register dst, Register x, Register y, Register z, int round) { ++ assert(round >= 0 && round < 80, "must be"); ++ assert_different_registers(dst, x, y, z, t0, t1); ++ ++ if (round < 20) { ++ // (x & y) ^ (~x & z) ++ __ andr(t0, x, y); ++ __ andn(dst, z, x); ++ __ xorr(dst, dst, t0); ++ } else if (round >= 40 && round < 60) { ++ // (x & y) ^ (x & z) ^ (y & z) ++ __ andr(t0, x, y); ++ __ andr(t1, x, z); ++ __ andr(dst, y, z); ++ __ xorr(dst, dst, t0); ++ __ xorr(dst, dst, t1); ++ } else { ++ // x ^ y ^ z ++ __ xorr(dst, x, y); ++ __ xorr(dst, dst, z); ++ } ++ } ++ ++ // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't ++ // e = d ++ // d = c ++ // c = ROTL'30(b) ++ // b = a ++ // a = T ++ void sha1_process_round(Register a, Register b, Register c, Register d, Register e, ++ Register cur_k, Register cur_w, Register tmp, int round) { ++ assert(round >= 0 && round < 80, "must be"); ++ assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0); ++ ++ // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't ++ ++ // cur_w will be recalculated at the beginning of each round, ++ // so, we can reuse it as a temp register here. ++ Register tmp2 = cur_w; ++ ++ // reuse e as a temporary register, as we will mv new value into it later ++ Register tmp3 = e; ++ __ add(tmp2, cur_k, tmp2); ++ __ add(tmp3, tmp3, tmp2); ++ __ rolw_imm(tmp2, a, 5, t0); ++ ++ sha1_f(tmp, b, c, d, round); ++ ++ __ add(tmp2, tmp2, tmp); ++ __ add(tmp2, tmp2, tmp3); ++ ++ // e = d ++ // d = c ++ // c = ROTL'30(b) ++ // b = a ++ // a = T ++ __ mv(e, d); ++ __ mv(d, c); ++ ++ __ rolw_imm(c, b, 30); ++ __ mv(b, a); ++ __ mv(a, tmp2); ++ } ++ ++ // H(i)0 = a + H(i-1)0 ++ // H(i)1 = b + H(i-1)1 ++ // H(i)2 = c + H(i-1)2 ++ // H(i)3 = d + H(i-1)3 ++ // H(i)4 = e + H(i-1)4 ++ void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e, ++ Register prev_ab, Register prev_cd, Register prev_e) { ++ assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e); ++ ++ __ add(a, a, prev_ab); ++ __ srli(prev_ab, prev_ab, 32); ++ __ add(b, b, prev_ab); ++ ++ __ add(c, c, prev_cd); ++ __ srli(prev_cd, prev_cd, 32); ++ __ add(d, d, prev_cd); ++ ++ __ add(e, e, prev_e); ++ } ++ ++ void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e, ++ Register prev_ab, Register prev_cd, Register prev_e) { ++ assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0); ++ ++ __ slli(t0, b, 32); ++ __ zero_extend(prev_ab, a, 32); ++ __ orr(prev_ab, prev_ab, t0); ++ ++ __ slli(t0, d, 32); ++ __ zero_extend(prev_cd, c, 32); ++ __ orr(prev_cd, prev_cd, t0); ++ ++ __ mv(prev_e, e); ++ } ++ ++ // Intrinsic for: ++ // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs) ++ // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit) ++ // ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0: byte[] src array + offset ++ // c_rarg1: int[] SHA.state ++ // - - - - - - below are only for implCompressMultiBlock0 - - - - - - ++ // c_rarg2: int offset ++ // c_rarg3: int limit ++ // ++ // Outputs: ++ // - - - - - - below are only for 
implCompressMultiBlock0 - - - - - - ++ // c_rarg0: int offset, when (multi_block == true) ++ // ++ address generate_sha1_implCompress(bool multi_block, const char *name) { ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", name); ++ ++ address start = __ pc(); ++ __ enter(); ++ ++ RegSet saved_regs = RegSet::range(x18, x27); ++ if (multi_block) { ++ // use x9 as src below. ++ saved_regs += RegSet::of(x9); ++ } ++ __ push_reg(saved_regs, sp); ++ ++ // c_rarg0 - c_rarg3: x10 - x13 ++ Register buf = c_rarg0; ++ Register state = c_rarg1; ++ Register offset = c_rarg2; ++ Register limit = c_rarg3; ++ // use src to contain the original start point of the array. ++ Register src = x9; ++ ++ if (multi_block) { ++ __ sub(limit, limit, offset); ++ __ add(limit, limit, buf); ++ __ sub(src, buf, offset); ++ } ++ ++ // [args-reg]: x14 - x17 ++ // [temp-reg]: x28 - x31 ++ // [saved-reg]: x18 - x27 ++ ++ // h0/1/2/3/4 ++ const Register a = x14, b = x15, c = x16, d = x17, e = x28; ++ // w0, w1, ... w15 ++ // put two adjecent w's in one register: ++ // one at high word part, another at low word part ++ // at different round (even or odd), w't value reside in different items in ws[]. ++ // w0 ~ w15, either reside in ++ // ws[0] ~ ws[7], where ++ // w0 at higher 32 bits of ws[0], ++ // w1 at lower 32 bits of ws[0], ++ // ... ++ // w14 at higher 32 bits of ws[7], ++ // w15 at lower 32 bits of ws[7]. ++ // or, reside in ++ // w0: ws[0]'s lower 32 bits ++ // w1 ~ w14: ws[1] ~ ws[7] ++ // w15: ws[8]'s higher 32 bits ++ Register ws[9] = {x29, x30, x31, x18, ++ x19, x20, x21, x22, ++ x23}; // auxiliary register for calculating w's value ++ // current k't's value ++ const Register cur_k = x24; ++ // current w't's value ++ const Register cur_w = x25; ++ // values of a, b, c, d, e in the previous round ++ const Register prev_ab = x26, prev_cd = x27; ++ const Register prev_e = offset; // reuse offset/c_rarg2 ++ ++ // load 5 words state into a, b, c, d, e. ++ // ++ // To minimize the number of memory operations, we apply following ++ // optimization: read the states (a/b/c/d) of 4-byte values in pairs, ++ // with a single ld, and split them into 2 registers. ++ // ++ // And, as the core algorithm of SHA-1 works on 32-bits words, so ++ // in the following code, it does not care about the content of ++ // higher 32-bits in a/b/c/d/e. Based on this observation, ++ // we can apply further optimization, which is to just ignore the ++ // higher 32-bits in a/c/e, rather than set the higher ++ // 32-bits of a/c/e to zero explicitly with extra instructions. ++ __ ld(a, Address(state, 0)); ++ __ srli(b, a, 32); ++ __ ld(c, Address(state, 8)); ++ __ srli(d, c, 32); ++ __ lw(e, Address(state, 16)); ++ ++ Label L_sha1_loop; ++ if (multi_block) { ++ __ BIND(L_sha1_loop); ++ } ++ ++ sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e); ++ ++ for (int round = 0; round < 80; round++) { ++ // prepare K't value ++ sha1_prepare_k(cur_k, round); ++ ++ // prepare W't value ++ sha1_prepare_w(cur_w, ws, buf, round); ++ ++ // one round process ++ sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round); ++ } ++ ++ // compute the intermediate hash value ++ sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e); ++ ++ if (multi_block) { ++ int64_t block_bytes = 16 * 4; ++ __ addi(buf, buf, block_bytes); ++ ++ __ bge(limit, buf, L_sha1_loop, true); ++ } ++ ++ // store back the state. 
++ __ zero_extend(a, a, 32); ++ __ slli(b, b, 32); ++ __ orr(a, a, b); ++ __ sd(a, Address(state, 0)); ++ __ zero_extend(c, c, 32); ++ __ slli(d, d, 32); ++ __ orr(c, c, d); ++ __ sd(c, Address(state, 8)); ++ __ sw(e, Address(state, 16)); ++ ++ // return offset ++ if (multi_block) { ++ __ sub(c_rarg0, buf, src); ++ } ++ ++ __ pop_reg(saved_regs, sp); ++ ++ __ leave(); ++ __ ret(); ++ ++ return (address) start; ++ } + #endif // COMPILER2_OR_JVMCI + #if INCLUDE_JFR + +@@ -4959,6 +5297,12 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_md5_implCompress = generate_md5_implCompress(false, "md5_implCompress"); + StubRoutines::_md5_implCompressMB = generate_md5_implCompress(true, "md5_implCompressMB"); + } ++ ++ if (UseSHA1Intrinsics) { ++ StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); ++ StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); ++ } ++ + #endif // COMPILER2_OR_JVMCI + } + +diff --git a/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp b/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp +index fde88f5e3..45e9bc0dc 100644 +--- a/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp ++++ b/src/hotspot/cpu/riscv/stubRoutines_riscv.hpp +@@ -39,7 +39,7 @@ enum platform_dependent_constants { + // simply increase sizes if too small (assembler will crash if too small) + _initial_stubs_code_size = 10000, + _continuation_stubs_code_size = 2000, +- _compiler_stubs_code_size = 15000 ZGC_ONLY(+5000), ++ _compiler_stubs_code_size = 25000 ZGC_ONLY(+5000), + _final_stubs_code_size = 20000 ZGC_ONLY(+10000) + }; + +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +index 38da4752c..39f41e0bf 100644 +--- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +@@ -121,16 +121,6 @@ void VM_Version::initialize() { + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + +- if (UseSHA1Intrinsics) { +- warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU."); +- FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); +- } +- +- if (UseSHA3Intrinsics) { +- warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU."); +- FLAG_SET_DEFAULT(UseSHA3Intrinsics, false); +- } +- + if (UseCRC32Intrinsics) { + warning("CRC32 intrinsics are not available on this CPU."); + FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); +@@ -210,34 +200,66 @@ void VM_Version::initialize() { + c2_initialize(); + #endif // COMPILER2 + +- if (UseZvkn && !UseRVV) { +- FLAG_SET_DEFAULT(UseZvkn, false); +- warning("Cannot enable Zvkn on cpu without RVV support."); +- } + +- if (!UseZvkn && UseSHA) { +- warning("SHA instructions are not available on this CPU"); +- FLAG_SET_DEFAULT(UseSHA, false); +- } else if (UseZvkn && FLAG_IS_DEFAULT(UseSHA)) { ++ // SHA's ++ if (FLAG_IS_DEFAULT(UseSHA)) { + FLAG_SET_DEFAULT(UseSHA, true); + } + +- if (!UseSHA) { ++ // SHA-1, no RVV required though. ++ if (UseSHA) { ++ if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) { ++ FLAG_SET_DEFAULT(UseSHA1Intrinsics, true); ++ } ++ } else if (UseSHA1Intrinsics) { ++ warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU."); ++ FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); ++ } ++ ++ // UseZvkn (depends on RVV) and SHA-2. ++ if (UseZvkn && !UseRVV) { ++ FLAG_SET_DEFAULT(UseZvkn, false); ++ warning("Cannot enable Zvkn on cpu without RVV support."); ++ } ++ // SHA-2, depends on Zvkn. 
++ if (UseSHA) { ++ if (UseZvkn) { ++ if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { ++ FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); ++ } ++ if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { ++ FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); ++ } ++ } else { ++ if (UseSHA256Intrinsics) { ++ warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed."); ++ FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); ++ } ++ if (UseSHA512Intrinsics) { ++ warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed."); ++ FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); ++ } ++ } ++ } else { + if (UseSHA256Intrinsics) { +- warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, UseZvkn needed."); ++ warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU, as UseSHA disabled."); + FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); + } + if (UseSHA512Intrinsics) { +- warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, UseZvkn needed."); ++ warning("Intrinsics for SHA-384 and SHA-512 crypto hash functions not available on this CPU, as UseSHA disabled."); + FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); + } +- } else { +- if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { +- FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); +- } +- if (FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { +- FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); +- } ++ } ++ ++ // SHA-3 ++ if (UseSHA3Intrinsics) { ++ warning("Intrinsics for SHA3-224, SHA3-256, SHA3-384 and SHA3-512 crypto hash functions not available on this CPU."); ++ FLAG_SET_DEFAULT(UseSHA3Intrinsics, false); ++ } ++ ++ // UseSHA ++ if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA3Intrinsics || UseSHA512Intrinsics)) { ++ FLAG_SET_DEFAULT(UseSHA, false); + } + } + diff --git a/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch b/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch new file mode 100644 index 0000000000000000000000000000000000000000..84e6b279777d6e4179ced1f8975337f388f8e470 --- /dev/null +++ b/Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch @@ -0,0 +1,62 @@ +diff --git a/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java b/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java +index e9ae2f6c1..cd5933ec9 100644 +--- a/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java ++++ b/test/hotspot/jtreg/compiler/intrinsics/sha/cli/TestUseMD5IntrinsicsOptionOnUnsupportedCPU.java +@@ -39,6 +39,7 @@ package compiler.intrinsics.sha.cli; + + import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForOtherCPU; + import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForUnsupportedAArch64CPU; ++import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForUnsupportedRISCV64CPU; + import compiler.intrinsics.sha.cli.testcases.GenericTestCaseForUnsupportedX86CPU; + import compiler.intrinsics.sha.cli.testcases.UseSHAIntrinsicsSpecificTestCaseForUnsupportedCPU; + +@@ -49,6 +50,8 @@ public class TestUseMD5IntrinsicsOptionOnUnsupportedCPU { + DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false), + new GenericTestCaseForUnsupportedAArch64CPU( + DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false), ++ new GenericTestCaseForUnsupportedRISCV64CPU( ++ 
DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false), + new GenericTestCaseForOtherCPU( + DigestOptionsBase.USE_MD5_INTRINSICS_OPTION, /* checkUseSHA = */ false)).test(); + } +diff --git a/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java b/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java +index 689c7c8cc..27fe99892 100644 +--- a/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java ++++ b/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java +@@ -61,14 +61,16 @@ public class IntrinsicPredicates { + + public static final BooleanSupplier MD5_INSTRUCTION_AVAILABLE + = new OrPredicate(new CPUSpecificPredicate("aarch64.*", null, null), ++ new OrPredicate(new CPUSpecificPredicate("riscv64.*", null, null), + // x86 variants + new OrPredicate(new CPUSpecificPredicate("amd64.*", null, null), + new OrPredicate(new CPUSpecificPredicate("i386.*", null, null), +- new CPUSpecificPredicate("x86.*", null, null)))); ++ new CPUSpecificPredicate("x86.*", null, null))))); + + public static final BooleanSupplier SHA1_INSTRUCTION_AVAILABLE + = new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha1" }, null), +- new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "sha1" }, null), ++ // SHA-1 intrinsic is implemented with scalar instructions on riscv64 ++ new OrPredicate(new CPUSpecificPredicate("riscv64.*", null, null), + new OrPredicate(new CPUSpecificPredicate("s390.*", new String[] { "sha1" }, null), + // x86 variants + new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "sha" }, null), +@@ -77,7 +79,7 @@ public class IntrinsicPredicates { + + public static final BooleanSupplier SHA256_INSTRUCTION_AVAILABLE + = new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha256" }, null), +- new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "sha256" }, null), ++ new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "zvkn" }, null), + new OrPredicate(new CPUSpecificPredicate("s390.*", new String[] { "sha256" }, null), + new OrPredicate(new CPUSpecificPredicate("ppc64.*", new String[] { "sha" }, null), + new OrPredicate(new CPUSpecificPredicate("ppc64le.*", new String[] { "sha" }, null), +@@ -90,7 +92,7 @@ public class IntrinsicPredicates { + + public static final BooleanSupplier SHA512_INSTRUCTION_AVAILABLE + = new OrPredicate(new CPUSpecificPredicate("aarch64.*", new String[] { "sha512" }, null), +- new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "sha512" }, null), ++ new OrPredicate(new CPUSpecificPredicate("riscv64.*", new String[] { "zvkn" }, null), + new OrPredicate(new CPUSpecificPredicate("s390.*", new String[] { "sha512" }, null), + new OrPredicate(new CPUSpecificPredicate("ppc64.*", new String[] { "sha" }, null), + new OrPredicate(new CPUSpecificPredicate("ppc64le.*", new String[] { "sha" }, null), diff --git a/Backport-JDK-8334999-RISC-V-AES-single-block-cryption-intrinsics.patch b/Backport-JDK-8334999-RISC-V-AES-single-block-cryption-intrinsics.patch new file mode 100644 index 0000000000000000000000000000000000000000..c6281af5e1e0044ff1954c675aef179602400138 --- /dev/null +++ b/Backport-JDK-8334999-RISC-V-AES-single-block-cryption-intrinsics.patch @@ -0,0 +1,271 @@ +diff --git a/src/hotspot/cpu/riscv/assembler_riscv.hpp b/src/hotspot/cpu/riscv/assembler_riscv.hpp +index 24e5f4fa8..4f2d2bfb3 100644 +--- a/src/hotspot/cpu/riscv/assembler_riscv.hpp ++++ 
b/src/hotspot/cpu/riscv/assembler_riscv.hpp +@@ -1846,6 +1846,12 @@ enum Nf { + // Vector Bit-manipulation used in Cryptography (Zvkb) Extension + INSN(vbrev8_v, 0b1010111, 0b010, 0b01000, 0b010010); + INSN(vrev8_v, 0b1010111, 0b010, 0b01001, 0b010010); ++ // Vector AES instructions (Zvkned extension) ++ INSN(vaesem_vv, 0b1110111, 0b010, 0b00010, 0b101000); ++ INSN(vaesef_vv, 0b1110111, 0b010, 0b00011, 0b101000); ++ ++ INSN(vaesdm_vv, 0b1110111, 0b010, 0b00000, 0b101000); ++ INSN(vaesdf_vv, 0b1110111, 0b010, 0b00001, 0b101000); + + #undef INSN + +diff --git a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +index a554729ab..dd478edbb 100644 +--- a/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp ++++ b/src/hotspot/cpu/riscv/stubGenerator_riscv.cpp +@@ -2313,6 +2313,173 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); + } + ++ void generate_aes_loadkeys(const Register &key, VectorRegister *working_vregs, int rounds) { ++ const int step = 16; ++ for (int i = 0; i < rounds; i++) { ++ __ vle32_v(working_vregs[i], key); ++ // The keys are stored in little-endian array, while we need ++ // to operate in big-endian. ++ // So performing an endian-swap here with vrev8.v instruction ++ __ vrev8_v(working_vregs[i], working_vregs[i]); ++ __ addi(key, key, step); ++ } ++ } ++ ++ void generate_aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) { ++ assert(rounds <= 15, "rounds should be less than or equal to working_vregs size"); ++ ++ __ vxor_vv(res, res, working_vregs[0]); ++ for (int i = 1; i < rounds - 1; i++) { ++ __ vaesem_vv(res, working_vregs[i]); ++ } ++ __ vaesef_vv(res, working_vregs[rounds - 1]); ++ } ++ ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // ++ address generate_aescrypt_encryptBlock() { ++ assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support"); ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); ++ ++ Label L_aes128, L_aes192; ++ ++ const Register from = c_rarg0; // source array address ++ const Register to = c_rarg1; // destination array address ++ const Register key = c_rarg2; // key array address ++ const Register keylen = c_rarg3; ++ ++ VectorRegister working_vregs[] = { ++ v4, v5, v6, v7, v8, v9, v10, v11, ++ v12, v13, v14, v15, v16, v17, v18 ++ }; ++ const VectorRegister res = v19; ++ ++ address start = __ pc(); ++ __ enter(); ++ ++ __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ ++ __ vsetivli(x0, 4, Assembler::e32, Assembler::m1); ++ __ vle32_v(res, from); ++ ++ __ mv(t2, 52); ++ __ blt(keylen, t2, L_aes128); ++ __ beq(keylen, t2, L_aes192); ++ // Else we fallthrough to the biggest case (256-bit key size) ++ ++ // Note: the following function performs key += 15*16 ++ generate_aes_loadkeys(key, working_vregs, 15); ++ generate_aes_encrypt(res, working_vregs, 15); ++ __ vse32_v(res, to); ++ __ mv(c_rarg0, 0); ++ __ leave(); ++ __ ret(); ++ ++ __ bind(L_aes192); ++ // Note: the following function performs key += 13*16 ++ generate_aes_loadkeys(key, working_vregs, 13); ++ generate_aes_encrypt(res, working_vregs, 13); ++ __ vse32_v(res, to); ++ __ mv(c_rarg0, 0); ++ __ leave(); ++ __ ret(); ++ ++ __ bind(L_aes128); ++ // Note: the following 
function performs key += 11*16 ++ generate_aes_loadkeys(key, working_vregs, 11); ++ generate_aes_encrypt(res, working_vregs, 11); ++ __ vse32_v(res, to); ++ __ mv(c_rarg0, 0); ++ __ leave(); ++ __ ret(); ++ ++ return start; ++ } ++ ++ void generate_aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) { ++ assert(rounds <= 15, "rounds should be less than or equal to working_vregs size"); ++ ++ __ vxor_vv(res, res, working_vregs[rounds - 1]); ++ for (int i = rounds - 2; i > 0; i--) { ++ __ vaesdm_vv(res, working_vregs[i]); ++ } ++ __ vaesdf_vv(res, working_vregs[0]); ++ } ++ ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // ++ address generate_aescrypt_decryptBlock() { ++ assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support"); ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); ++ ++ Label L_aes128, L_aes192; ++ ++ const Register from = c_rarg0; // source array address ++ const Register to = c_rarg1; // destination array address ++ const Register key = c_rarg2; // key array address ++ const Register keylen = c_rarg3; ++ ++ VectorRegister working_vregs[] = { ++ v4, v5, v6, v7, v8, v9, v10, v11, ++ v12, v13, v14, v15, v16, v17, v18 ++ }; ++ const VectorRegister res = v19; ++ ++ address start = __ pc(); ++ __ enter(); // required for proper stackwalking of RuntimeStub frame ++ ++ __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ ++ __ vsetivli(x0, 4, Assembler::e32, Assembler::m1); ++ __ vle32_v(res, from); ++ ++ __ mv(t2, 52); ++ __ blt(keylen, t2, L_aes128); ++ __ beq(keylen, t2, L_aes192); ++ // Else we fallthrough to the biggest case (256-bit key size) ++ ++ // Note: the following function performs key += 15*16 ++ generate_aes_loadkeys(key, working_vregs, 15); ++ generate_aes_decrypt(res, working_vregs, 15); ++ __ vse32_v(res, to); ++ __ mv(c_rarg0, 0); ++ __ leave(); ++ __ ret(); ++ ++ __ bind(L_aes192); ++ // Note: the following function performs key += 13*16 ++ generate_aes_loadkeys(key, working_vregs, 13); ++ generate_aes_decrypt(res, working_vregs, 13); ++ __ vse32_v(res, to); ++ __ mv(c_rarg0, 0); ++ __ leave(); ++ __ ret(); ++ ++ __ bind(L_aes128); ++ // Note: the following function performs key += 11*16 ++ generate_aes_loadkeys(key, working_vregs, 11); ++ generate_aes_decrypt(res, working_vregs, 11); ++ __ vse32_v(res, to); ++ __ mv(c_rarg0, 0); ++ __ leave(); ++ __ ret(); ++ ++ return start; ++ } + // code for comparing 16 bytes of strings with same encoding + void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { + const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, tmp1 = x28, tmp2 = x29, tmp4 = x7, tmp5 = x31; +@@ -5271,6 +5438,11 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_montgomerySquare = g.generate_square(); + } + ++ if (UseAESIntrinsics) { ++ StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); ++ StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); ++ } ++ + if (UseRVVForBigIntegerShiftIntrinsics) { + StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); + StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); +diff --git a/src/hotspot/cpu/riscv/vm_version_riscv.cpp b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +index 39f41e0bf..c49072633 100644 
+--- a/src/hotspot/cpu/riscv/vm_version_riscv.cpp ++++ b/src/hotspot/cpu/riscv/vm_version_riscv.cpp +@@ -105,17 +105,6 @@ void VM_Version::initialize() { + FLAG_SET_DEFAULT(AllocatePrefetchDistance, 0); + } + +- if (UseAES || UseAESIntrinsics) { +- if (UseAES && !FLAG_IS_DEFAULT(UseAES)) { +- warning("AES instructions are not available on this CPU"); +- FLAG_SET_DEFAULT(UseAES, false); +- } +- if (UseAESIntrinsics && !FLAG_IS_DEFAULT(UseAESIntrinsics)) { +- warning("AES intrinsics are not available on this CPU"); +- FLAG_SET_DEFAULT(UseAESIntrinsics, false); +- } +- } +- + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); +@@ -360,6 +349,23 @@ void VM_Version::c2_initialize() { + if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { + FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, true); + } ++ ++ // AES ++ if (UseZvkn) { ++ UseAES = UseAES || FLAG_IS_DEFAULT(UseAES); ++ UseAESIntrinsics = ++ UseAESIntrinsics || (UseAES && FLAG_IS_DEFAULT(UseAESIntrinsics)); ++ if (UseAESIntrinsics && !UseAES) { ++ warning("UseAESIntrinsics enabled, but UseAES not, enabling"); ++ UseAES = true; ++ } ++ } else if (UseAESIntrinsics || UseAES) { ++ if (!FLAG_IS_DEFAULT(UseAESIntrinsics) || !FLAG_IS_DEFAULT(UseAES)) { ++ warning("AES intrinsics require Zvkn extension (not available on this CPU)."); ++ } ++ FLAG_SET_DEFAULT(UseAES, false); ++ FLAG_SET_DEFAULT(UseAESIntrinsics, false); ++ } + } + #endif // COMPILER2 + +diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp +index 7ca86d094..b24afb297 100644 +--- a/src/hotspot/share/opto/library_call.cpp ++++ b/src/hotspot/share/opto/library_call.cpp +@@ -7007,11 +7007,11 @@ bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) { + + //------------------------------get_key_start_from_aescrypt_object----------------------- + Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { +-#if defined(PPC64) || defined(S390) ++#if defined(PPC64) || defined(S390) || defined(RISCV64) + // MixColumns for decryption can be reduced by preprocessing MixColumns with round keys. + // Intel's extension is based on this optimization and AESCrypt generates round keys by preprocessing MixColumns. + // However, ppc64 vncipher processes MixColumns and requires the same round keys with encryption. +- // The ppc64 stubs of encryption and decryption use the same round keys (sessionK[0]). ++ // The ppc64 and riscv64 stubs of encryption and decryption use the same round keys (sessionK[0]). + Node* objSessionK = load_field_from_object(aescrypt_object, "sessionK", "[[I"); + assert (objSessionK != nullptr, "wrong version of com.sun.crypto.provider.AESCrypt"); + if (objSessionK == nullptr) { diff --git a/openjdk-21.spec b/openjdk-21.spec index d7e8e1eb3a9a9d1e8f43e6c3cc669a35d647459e..eabf9aafc2209d77289117014924e07fdf41e5e0 100644 --- a/openjdk-21.spec +++ b/openjdk-21.spec @@ -905,7 +905,7 @@ Name: java-21-%{origin} Version: %{newjavaver}.%{buildver} # This package needs `.rolling` as part of Release so as to not conflict on install with # java-X-openjdk. I.e. when latest rolling release is also an LTS release packaged as -Release: 3 +Release: 4 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. 
java-1.5.0-ibm packages @@ -1059,6 +1059,10 @@ Patch3004: Backport-JDK-8348554-Enhance-Linux-kernel-version-ch.patch Patch3005: Backport-JDK-8348384-RISC-V-Disable-auto-enable-Vect.patch Patch3006: Backport-JDK-8352673-RISC-V-Vector-can-t-be-turned-o.patch Patch3007: Backport-JDK-8355878-RISC-V-jdk-incubator-vector-Dou.patch +Patch3008: Backport-JDK-8319716-8327283-RISC-V-Add-SHA-2.patch +Patch3009: Backport-JDK-8322179-RISC-V-Implement-SHA-1-intrinsic.patch +Patch3010: Backport-JDK-8322209-8322179-8329641-RISC-V-Enable-sha-md5-tests.patch +Patch3011: Backport-JDK-8334999-RISC-V-AES-single-block-cryption-intrinsics.patch BuildRequires: autoconf BuildRequires: automake @@ -1361,6 +1365,10 @@ pushd %{top_level_dir_name} %patch3005 -p1 %patch3006 -p1 %patch3007 -p1 +%patch3008 -p1 +%patch3009 -p1 +%patch3010 -p1 +%patch3011 -p1 popd %endif @@ -1918,6 +1926,9 @@ cjc.mainProgram(args) -- the returns from copy_jdk_configs.lua should not affect %changelog +* Fri Oct 10 2025 chenlang - 1:21.0.8.9-4 +- RISC-V add sha1 sha2 and zvkn patches + * Tue Aug 26 2025 songliyang - 1:21.0.8.9-3 - update LoongArch64 port to 21.0.8
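
For context, a minimal standalone sketch of the key-length dispatch used by the new aescrypt stubs in Backport-JDK-8334999: the expanded key that com.sun.crypto.provider.AESCrypt stores in sessionK[0] holds 4 * (rounds + 1) 32-bit words, i.e. 44 for AES-128, 52 for AES-192 and 60 for AES-256, and the stub branches on that length (blt/beq against 52) before loading 11, 13 or 15 128-bit round keys. The helper name round_keys_for below is illustrative only and is not part of the patch.

#include <cassert>

// Maps the expanded-key length in 32-bit words to the number of 128-bit
// round keys loaded by generate_aes_loadkeys: 44 -> 11 (AES-128),
// 52 -> 13 (AES-192), 60 -> 15 (AES-256). The generated stub performs the
// same dispatch with "mv t2, 52; blt keylen, t2, L_aes128; beq keylen, t2, L_aes192".
static int round_keys_for(int keylen_in_ints) {
  assert(keylen_in_ints == 44 || keylen_in_ints == 52 || keylen_in_ints == 60);
  if (keylen_in_ints < 52) {
    return 11;   // AES-128: 10 rounds plus the initial whitening key
  } else if (keylen_in_ints == 52) {
    return 13;   // AES-192: 12 rounds plus the initial whitening key
  }
  return 15;     // AES-256: 14 rounds plus the initial whitening key
}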