openjdk-1.8.0_2
/
8143925-enhancing-CounterMode.crypt-f...

From 02b097417275acaad294d71a852c2def2222be25 Mon Sep 17 00:00:00 2001
From: kuenking111 <[email protected]>
Date: Sat, 3 Sep 2022 14:17:50 +0000
Subject: [PATCH 1/6] 8143925-enhancing-CounterMode.crypt-for-AESCrypt

---
 .../src/cpu/aarch64/vm/assembler_aarch64.hpp  |  35 +-
 .../cpu/aarch64/vm/macroAssembler_aarch64.hpp |  17 +
 .../aarch64/vm/macroAssembler_aarch64_aes.cpp | 685 ++++++++++++++++++
 .../cpu/aarch64/vm/stubGenerator_aarch64.cpp  | 324 ++++++++-
 .../cpu/aarch64/vm/stubRoutines_aarch64.hpp   |   2 +-
 .../src/cpu/aarch64/vm/vm_version_aarch64.cpp |  13 +-
 hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp     |   5 +
 hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp |   5 +
 hotspot/src/cpu/x86/vm/assembler_x86.cpp      |  74 +-
 hotspot/src/cpu/x86/vm/assembler_x86.hpp      |  12 +
 .../src/cpu/x86/vm/stubGenerator_x86_32.cpp   | 344 +++++++++
 .../src/cpu/x86/vm/stubGenerator_x86_64.cpp   | 340 ++++++++-
 hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp   |   1 +
 hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp   |   5 +
 .../src/cpu/x86/vm/stubRoutines_x86_32.hpp    |   2 +-
 .../src/cpu/x86/vm/stubRoutines_x86_64.hpp    |   2 +-
 hotspot/src/cpu/x86/vm/vm_version_x86.cpp     |  36 +
 hotspot/src/share/vm/classfile/vmSymbols.hpp  |   4 +
 hotspot/src/share/vm/opto/escape.cpp          |   1 +
 hotspot/src/share/vm/opto/library_call.cpp    | 174 +++++
 hotspot/src/share/vm/opto/runtime.cpp         |  29 +
 hotspot/src/share/vm/opto/runtime.hpp         |   1 +
 hotspot/src/share/vm/runtime/globals.hpp      |   3 +
 hotspot/src/share/vm/runtime/stubRoutines.cpp |   1 +
 hotspot/src/share/vm/runtime/stubRoutines.hpp |   2 +
 hotspot/src/share/vm/runtime/vmStructs.cpp    |   1 +
 .../test/compiler/7184394/TestAESBase.java    |   4 +-
 .../test/compiler/7184394/TestAESMain.java    |   7 +
 .../com/sun/crypto/provider/CounterMode.java  |  11 +-
 .../classes/com/sun/crypto/provider/GCTR.java |  89 +--
 .../com/sun/crypto/provider/GHASH.java        |  20 +-
 .../sun/security/ssl/SSLSocketImpl.java       |  14 +-
 .../security/ssl/SSLSocketInputRecord.java    | 215 +++---
 .../sun/security/ssl/SSLTransport.java        |   4 +
 .../bench/javax/crypto/full/AESGCMBench.java  | 128 ++++
 .../javax/crypto/full/AESGCMByteBuffer.java   | 163 +++++
 .../bench/javax/crypto/full/CryptoBase.java   | 102 +++
 .../bench/javax/crypto/small/AESGCMBench.java |  36 +
 .../javax/crypto/small/AESGCMByteBuffer.java  |  36 +
 .../ssl/SSLSocketImpl/ClientTimeout.java      |   3 +-
 .../SSLSocketImpl/SSLExceptionForIOIssue.java |   4 +-
 41 files changed, 2738 insertions(+), 216 deletions(-)
 create mode 100644 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp
 create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java
 create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java
 create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java
 create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java
 create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java

diff --git a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
index b0fa9b5fc..9202e61f8 100644
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
@@ -146,6 +146,21 @@ REGISTER_DECLARATION(Register, esp,      r20);

 #define assert_cond(ARG1) assert(ARG1, #ARG1)

+// In many places we've added C-style casts to silence compiler
+// warnings, for example when truncating a size_t to an int when we
+// know the size_t is the size of a small struct. Such casts are risky
+// because they effectively disable useful compiler warnings. We can
+// make our lives safer with this function, which asserts that a cast
+// is reversible without loss of information. It doesn't check
+// everything: it isn't intended to make sure that pointer types are
+// compatible, for example.
+template <typename T2, typename T1>
+T2 checked_cast(T1 thing) {
+  T2 result = static_cast<T2>(thing);
+  assert(static_cast<T1>(result) == thing, "must be");  // casting back must reproduce the input
+  return result;
+}
+
 namespace asm_util {
   uint32_t encode_logical_immediate(bool is32, uint64_t imm);
 };
@@ -193,7 +208,7 @@ public:
   static inline uint32_t extract(uint32_t val, int msb, int lsb) {
     int nbits = msb - lsb + 1;
     assert_cond(msb >= lsb);
-    uint32_t mask = (1U << nbits) - 1;
+    uint32_t mask = checked_cast<uint32_t>(right_n_bits(nbits));
     uint32_t result = val >> lsb;
     result &= mask;
     return result;
@@ -208,7 +223,7 @@ public:
     int nbits = msb - lsb + 1;
     guarantee(val < (1U << nbits), "Field too big for insn");
     assert_cond(msb >= lsb);
-    unsigned mask = (1U << nbits) - 1;
+    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
     val <<= lsb;
     mask <<= lsb;
     unsigned target = *(unsigned *)a;
@@ -222,7 +237,7 @@ public:
     long chk = val >> (nbits - 1);
     guarantee (chk == -1 || chk == 0, "Field too big for insn");
     unsigned uval = val;
-    unsigned mask = (1U << nbits) - 1;
+    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
     uval &= mask;
     uval <<= lsb;
     mask <<= lsb;
@@ -234,9 +249,9 @@ public:

   void f(unsigned val, int msb, int lsb) {
     int nbits = msb - lsb + 1;
-    guarantee(val < (1U << nbits), "Field too big for insn");
+    guarantee(val < (1ULL << nbits), "Field too big for insn");
     assert_cond(msb >= lsb);
-    unsigned mask = (1U << nbits) - 1;
+    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
     val <<= lsb;
     mask <<= lsb;
     insn |= val;
@@ -255,7 +270,7 @@ public:
     long chk = val >> (nbits - 1);
     guarantee (chk == -1 || chk == 0, "Field too big for insn");
     unsigned uval = val;
-    unsigned mask = (1U << nbits) - 1;
+    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
     uval &= mask;
     f(uval, lsb + nbits - 1, lsb);
   }
@@ -280,7 +295,7 @@ public:

   unsigned get(int msb = 31, int lsb = 0) {
     int nbits = msb - lsb + 1;
-    unsigned mask = ((1U << nbits) - 1) << lsb;
+    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)) << lsb;
     assert_cond((bits & mask) == mask);
     return (insn & mask) >> lsb;
   }
@@ -1991,21 +2006,21 @@ public:
     starti;
     f(0,31), f((int)T & 1, 30);
     f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
   }
   void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
              int imm, int op1, int op2) {
     starti;
     f(0,31), f((int)T & 1, 30);
     f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
   }
   void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
              Register Xm, int op1, int op2) {
     starti;
     f(0,31), f((int)T & 1, 30);
     f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12);
-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
   }

  void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) {
diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
index 0ca694038..d334f1b69 100644
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
@@ -1240,6 +1240,23 @@ public:
   void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
                        Register zlen, Register tmp1, Register tmp2, Register tmp3,
                        Register tmp4, Register tmp5, Register tmp6, Register tmp7);
+  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
+                        FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
+                        FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3);
+  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
+                      FloatRegister p, FloatRegister z, FloatRegister t1);
+  void ghash_processBlocks_wide(address p, Register state, Register subkeyH,
+                                  Register data, Register blocks, int unrolls);
+  void ghash_modmul (FloatRegister result,
+                       FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
+                       FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
+                       FloatRegister t1, FloatRegister t2, FloatRegister t3);
+
+  void aesenc_loadkeys(Register key, Register keylen);
+  void aesecb_encrypt(Register from, Register to, Register keylen,
+                        FloatRegister data = v0, int unrolls = 1);
+  void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
+  void aes_round(FloatRegister input, FloatRegister subkey);
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }

diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp
new file mode 100644
index 000000000..1db79c97a
--- /dev/null
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "macroAssembler_aarch64.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/stubRoutines.hpp"
+
+void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) {
+  Label L_doLast;
+  // AES-decrypt the 16 bytes at from and store the result at to; clobbers v0..v5.
+  ld1(v0, T16B, from); // get 16 bytes of input
+  // v5 holds the first 16 bytes at key; it is XORed into the result last.
+  ld1(v5, T16B, post(key, 16));
+  rev32(v5, T16B, v5);
+
+  ld1(v1, v2, v3, v4, T16B, post(key, 64));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+  rev32(v3, T16B, v3);
+  rev32(v4, T16B, v4);
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+  aesd(v0, v3);
+  aesimc(v0, v0);
+  aesd(v0, v4);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, v3, v4, T16B, post(key, 64));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+  rev32(v3, T16B, v3);
+  rev32(v4, T16B, v4);
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+  aesd(v0, v3);
+  aesimc(v0, v0);
+  aesd(v0, v4);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, T16B, post(key, 32));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+  // keylen == 44 (presumably AES-128): v1/v2 already hold the keys for the last two rounds.
+  cmpw(keylen, 44);
+  br(Assembler::EQ, L_doLast);
+
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, T16B, post(key, 32));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+  // keylen == 52 (presumably AES-192): likewise, go straight to the last two rounds.
+  cmpw(keylen, 52);
+  br(Assembler::EQ, L_doLast);
+
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+  aesimc(v0, v0);
+
+  ld1(v1, v2, T16B, post(key, 32));
+  rev32(v1, T16B, v1);
+  rev32(v2, T16B, v2);
+
+  bind(L_doLast);
+  // Last two rounds: the final aesd is not followed by aesimc.
+  aesd(v0, v1);
+  aesimc(v0, v0);
+  aesd(v0, v2);
+
+  eor(v0, T16B, v0, v5);
+
+  st1(v0, T16B, to);
+
+  // Rewind key by keylen ints (what the post-indexed loads consumed), back to its start
+  sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
+}
+
+// Load expanded key into v17..v31 (shorter keys leave the lowest registers unloaded)
+void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) {
+  Label L_loadkeys_44, L_loadkeys_52;
+  cmpw(keylen, 52);
+  br(Assembler::LO, L_loadkeys_44);
+  br(Assembler::EQ, L_loadkeys_52);
+  // Fall through when keylen > 52: the key is loaded starting at v17.
+  ld1(v17, v18,  T16B,  post(key, 32));
+  rev32(v17,  T16B, v17);
+  rev32(v18,  T16B, v18);
+  bind(L_loadkeys_52);  // keylen == 52: the key is loaded starting at v19
+  ld1(v19, v20,  T16B,  post(key, 32));
+  rev32(v19,  T16B, v19);
+  rev32(v20,  T16B, v20);
+  bind(L_loadkeys_44);  // keylen < 52: the key is loaded starting at v21
+  ld1(v21, v22, v23, v24,  T16B,  post(key, 64));
+  rev32(v21,  T16B, v21);
+  rev32(v22,  T16B, v22);
+  rev32(v23,  T16B, v23);
+  rev32(v24,  T16B, v24);
+  ld1(v25, v26, v27, v28,  T16B,  post(key, 64));
+  rev32(v25,  T16B, v25);
+  rev32(v26,  T16B, v26);
+  rev32(v27,  T16B, v27);
+  rev32(v28,  T16B, v28);
+  ld1(v29, v30, v31,  T16B, post(key, 48));
+  rev32(v29,  T16B, v29);
+  rev32(v30,  T16B, v30);
+  rev32(v31,  T16B, v31);
+
+  // Rewind key by keylen ints (what the post-indexed loads consumed), back to its start
+  sub(key, key, keylen, LSL, exact_log2(sizeof (jint)));
+}
+
+// Neoverse(TM) N1 Software Optimization Guide:
+// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC
+// instruction pairs will exhibit the performance characteristics
+// described in Section 4.6.
+void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) {
+  aese(input, subkey); aesmc(input, input);  // keep the AESE/AESMC pair adjacent (see above)
+}
+
+// KernelGenerator
+//
+// The abstract base class of an unrolled function generator.
+// Subclasses override generate(), length(), and next() to generate
+// unrolled and interleaved functions.
+//
+// The core idea is that a subclass defines a method which generates
+// the base case of a function and a method to generate a clone of it,
+// shifted to a different set of registers. KernelGenerator will then
+// generate several interleaved copies of the function, with each one
+// using a different set of registers.
+
+// The subclass must implement three methods: length(), which is the
+// number of instruction bundles in the intrinsic, generate(int n)
+// which emits the nth instruction bundle in the intrinsic, and next()
+// which takes an instance of the generator and returns a version of it,
+// shifted to a new set of registers.
+
+class KernelGenerator: public MacroAssembler {
+protected:
+  const int _unrolls;  // number of interleaved copies emitted by unroll()
+public:
+  KernelGenerator(Assembler *as, int unrolls)
+    : MacroAssembler(as->code()), _unrolls(unrolls) { }  // constructed over as->code()
+  virtual void generate(int index) = 0;  // emit bundle number 'index' (0 .. length()-1)
+  virtual int length() = 0;              // number of bundles in one copy
+  virtual KernelGenerator *next() = 0;   // new generator, shifted to the next register set
+  int unrolls() { return _unrolls; }
+  void unroll();                         // emit all copies, interleaved bundle by bundle
+};
+
+void KernelGenerator::unroll() {
+  ResourceMark rm;
+  KernelGenerator **generators
+    = NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls());
+  // generators[0] is this generator; each later entry is its predecessor's next().
+  generators[0] = this;
+  for (int i = 1; i < unrolls(); i++) {
+    generators[i] = generators[i-1]->next();
+  }
+  // Emit bundle j of every copy before bundle j+1 of any copy, interleaving the copies.
+  for (int j = 0; j < length(); j++) {
+    for (int i = 0; i < unrolls(); i++) {
+      generators[i]->generate(j);
+    }
+  }
+}
+
+// An unrolled and interleaved generator for AES encryption.
+class AESKernelGenerator: public KernelGenerator {
+  Register _from, _to;           // optional input/output addresses; noreg skips the load/store
+  const Register _keylen;        // compared against 52 to choose how many rounds are run
+  FloatRegister _data;           // register holding the block processed by this copy
+  const FloatRegister _subkeys;  // first of the 15 consecutive registers used as round keys
+  bool _once;                    // true only for the first copy, which emits the branches and labels
+  Label _rounds_44, _rounds_52;
+
+public:
+  AESKernelGenerator(Assembler *as, int unrolls,
+                     Register from, Register to, Register keylen, FloatRegister data,
+                     FloatRegister subkeys, bool once = true)
+    : KernelGenerator(as, unrolls),
+      _from(from), _to(to), _keylen(keylen), _data(data),
+      _subkeys(subkeys), _once(once) {
+  }
+  // Emit bundle number 'index' of this copy; a bundle may be empty.
+  virtual void generate(int index) {
+    switch (index) {
+    case  0:
+      if (_from != noreg) {
+        ld1(_data, T16B, _from); // get 16 bytes of input
+      }
+      break;
+    case  1:
+      if (_once) {  // keylen < 52 skips the first four rounds, keylen == 52 the first two
+        cmpw(_keylen, 52);
+        br(Assembler::LO, _rounds_44);
+        br(Assembler::EQ, _rounds_52);
+      }
+      break;
+    case  2:  aes_round(_data, _subkeys +  0);  break;
+    case  3:  aes_round(_data, _subkeys +  1);  break;
+    case  4:
+      if (_once)  bind(_rounds_52);
+      break;
+    case  5:  aes_round(_data, _subkeys +  2);  break;
+    case  6:  aes_round(_data, _subkeys +  3);  break;
+    case  7:
+      if (_once)  bind(_rounds_44);
+      break;
+    case  8:  aes_round(_data, _subkeys +  4);  break;
+    case  9:  aes_round(_data, _subkeys +  5);  break;
+    case 10:  aes_round(_data, _subkeys +  6);  break;
+    case 11:  aes_round(_data, _subkeys +  7);  break;
+    case 12:  aes_round(_data, _subkeys +  8);  break;
+    case 13:  aes_round(_data, _subkeys +  9);  break;
+    case 14:  aes_round(_data, _subkeys + 10);  break;
+    case 15:  aes_round(_data, _subkeys + 11);  break;
+    case 16:  aes_round(_data, _subkeys + 12);  break;
+    case 17:  aese(_data, _subkeys + 13);  break;  // last round: aese without aesmc
+    case 18:  eor(_data, T16B, _data, _subkeys + 14);  break;  // xor in the last key register
+    case 19:
+      if (_to != noreg) {
+        st1(_data, T16B, _to);
+      }
+      break;
+    default: ShouldNotReachHere();
+    }
+  }
+  // The clone works on _data + 1, shares all other registers and emits no branches or labels.
+  virtual KernelGenerator *next() {
+    return new AESKernelGenerator(this, _unrolls,
+                                  _from, _to, _keylen,
+                                  _data + 1, _subkeys, /*once*/false);
+  }
+
+  virtual int length() { return 20; }
+};
+
+// Uses expanded key in v17..v31
+// Returns encrypted values in inputs.
+// If from != noreg, load the input from from; if to != noreg, store the result at to
+// Preserves keylen
+// NOTE(review): from/to are used as plain addresses here, with no post-increment -- confirm
+// Input data in data, data + 1, ... (one register per unrolled copy; data defaults to v0)
+// unrolls controls the number of times to unroll the generated function
+void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
+                                    FloatRegister data, int unrolls) {
+  AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll();
+}
+
+// ghash_multiply and ghash_reduce are the non-unrolled versions of
+// the GHASH function generators.
+void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
+                                     FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
+                                     FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
+  // Karatsuba multiplication performs a 128*128 -> 256-bit
+  // multiplication in three 64*64 -> 128-bit multiplications and a
+  // few additions.
+  //
+  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+  //
+  // Inputs:
+  //
+  // A0 in a.d[0]     (subkey)
+  // A1 in a.d[1]
+  // (A1+A0) in a1_xor_a0.d[0]
+  //
+  // B0 in b.d[0]     (state)
+  // B1 in b.d[1]
+
+  ext(tmp1, T16B, b, b, 0x08);
+  pmull2(result_hi, T1Q, b, a, T2D);  // A1*B1
+  eor(tmp1, T16B, tmp1, b);           // (B1+B0)
+  pmull(result_lo,  T1Q, b, a, T1D);  // A0*B0
+  pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
+  // Combine the three products into the two middle 64-bit words of the result (in tmp2).
+  ext(tmp1, T16B, result_lo, result_hi, 0x08);
+  eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0
+  eor(tmp2, T16B, tmp2, tmp1);
+  eor(tmp2, T16B, tmp2, tmp3);
+
+  // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
+  ins(result_hi, D, tmp2, 0, 1);
+  ins(result_lo, D, tmp2, 1, 0);
+}
+
+void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
+                  FloatRegister p, FloatRegister vzr, FloatRegister t1) {
+  const FloatRegister t0 = result;  // result doubles as a temporary until the last instruction
+  // Note: lo, hi and t1 are overwritten.
+  // The GCM field polynomial f is z^128 + p(z), where p =
+  // z^7+z^2+z+1.
+  //
+  //    z^128 === -p(z)  (mod (z^128 + p(z)))
+  //
+  // so, given that the product we're reducing is
+  //    a == lo + hi * z^128
+  // substituting,
+  //      === lo - hi * p(z)  (mod (z^128 + p(z)))
+  //
+  // we reduce by multiplying hi by p(z) and subtracting the result
+  // from (i.e. XORing it with) lo.  Because p has no nonzero high
+  // bits we can do this with two 64-bit multiplications, one for
+  // each 64-bit half of hi.
+
+  pmull2(t0, T1Q, hi, p, T2D);
+  ext(t1, T16B, t0, vzr, 8);
+  eor(hi, T16B, hi, t1);
+  ext(t1, T16B, vzr, t0, 8);
+  eor(lo, T16B, lo, t1);
+  pmull(t0, T1Q, hi, p, T1D);
+  eor(result, T16B, lo, t0);
+}
+
+class GHASHMultiplyGenerator: public KernelGenerator {
+  FloatRegister _result_lo, _result_hi, _b,
+    _a, _vzr, _a1_xor_a0, _p,
+    _tmp1, _tmp2, _tmp3;
+
+public:
+  GHASHMultiplyGenerator(Assembler *as, int unrolls,
+                         FloatRegister result_lo, FloatRegister result_hi,
+                         /* offsetted registers */
+                         FloatRegister b,
+                         /* non-offsetted (shared) registers */
+                         FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
+                         /* offsetted (temp) registers */
+                         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
+    : KernelGenerator(as, unrolls),
+      _result_lo(result_lo), _result_hi(result_hi), _b(b),
+      _a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
+      _tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
+
+  static const int register_stride = 7;  // distance in registers between consecutive copies
+
+  virtual void generate(int index) {
+    // Karatsuba multiplication performs a 128*128 -> 256-bit
+    // multiplication in three 64*64 -> 128-bit multiplications and a
+    // few additions.
+    //
+    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+    //
+    // Inputs:
+    //
+    // A0 in a.d[0]     (subkey)
+    // A1 in a.d[1]
+    // (A1+A0) in a1_xor_a0.d[0]
+    //
+    // B0 in b.d[0]     (state)
+    // B1 in b.d[1]
+
+    switch (index) {
+      case  0:  ext(_tmp1, T16B, _b, _b, 0x08);  break;
+      case  1:  pmull2(_result_hi, T1Q, _b, _a, T2D);  // A1*B1
+        break;
+      case  2:  eor(_tmp1, T16B, _tmp1, _b);           // (B1+B0)
+        break;
+      case  3:  pmull(_result_lo,  T1Q, _b, _a, T1D);  // A0*B0
+        break;
+      case  4:  pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
+        break;
+
+      case  5:  ext(_tmp1, T16B, _result_lo, _result_hi, 0x08);  break;
+      case  6:  eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
+        break;
+      case  7:  eor(_tmp2, T16B, _tmp2, _tmp1);  break;
+      case  8:  eor(_tmp2, T16B, _tmp2, _tmp3);  break;
+
+        // Register pair <_result_hi:_result_lo> holds the result of carry-less multiplication
+      case  9:  ins(_result_hi, D, _tmp2, 0, 1);  break;
+      case 10:  ins(_result_lo, D, _tmp2, 1, 0);  break;
+      default: ShouldNotReachHere();
+    }
+  }
+  // The clone moves the results, _b and the temps up by register_stride; the rest is shared.
+  virtual KernelGenerator *next() {
+    GHASHMultiplyGenerator *result
+      = new GHASHMultiplyGenerator(this, _unrolls, _result_lo, _result_hi,
+                                   _b, _a, _a1_xor_a0, _p, _vzr,
+                                   _tmp1, _tmp2, _tmp3);
+    result->_result_lo += register_stride;
+    result->_result_hi += register_stride;
+    result->_b += register_stride;
+    result->_tmp1 += register_stride;
+    result->_tmp2 += register_stride;
+    result->_tmp3 += register_stride;
+    return result;
+  }
+
+  virtual int length() { return 11; }
+};
+
+// Reduce the 256-bit product in hi:lo by the GCM field polynomial.
+// The FloatRegister argument called data is optional: if it is a
+// valid register, we interleave LD1 instructions with the
+// reduction. This is to reduce latency next time around the loop.
+class GHASHReduceGenerator: public KernelGenerator {
+  FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
+  bool _once;  // true only for the first copy, which emits the interleaved loads
+public:
+  GHASHReduceGenerator(Assembler *as, int unrolls,
+                       /* offsetted registers */
+                       FloatRegister result, FloatRegister lo, FloatRegister hi,
+                       /* non-offsetted (shared) registers */
+                       FloatRegister p, FloatRegister vzr, FloatRegister data,
+                       /* offsetted (temp) registers */
+                       FloatRegister t1)
+    : KernelGenerator(as, unrolls),
+      _result(result), _lo(lo), _hi(hi),
+      _p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
+
+  static const int register_stride = 7;  // distance in registers between consecutive copies
+
+  virtual void generate(int index) {
+    const FloatRegister t0 = _result;  // _result doubles as a temporary until the last bundle
+
+    switch (index) {
+      // The GCM field polynomial f is z^128 + p(z), where p =
+      // z^7+z^2+z+1.
+      //
+      //    z^128 === -p(z)  (mod (z^128 + p(z)))
+      //
+      // so, given that the product we're reducing is
+      //    a == lo + hi * z^128
+      // substituting,
+      //      === lo - hi * p(z)  (mod (z^128 + p(z)))
+      //
+      // we reduce by multiplying hi by p(z) and subtracting the result
+      // from (i.e. XORing it with) lo.  Because p has no nonzero high
+      // bits we can do this with two 64-bit multiplications, one for
+      // each 64-bit half of hi.
+
+      case  0:  pmull2(t0, T1Q, _hi, _p, T2D);  break;
+      case  1:  ext(_t1, T16B, t0, _vzr, 8);  break;
+      case  2:  eor(_hi, T16B, _hi, _t1);  break;
+      case  3:  ext(_t1, T16B, _vzr, t0, 8);  break;
+      case  4:  eor(_lo, T16B, _lo, _t1);  break;
+      case  5:  pmull(t0, T1Q, _hi, _p, T1D);  break;
+      case  6:  eor(_result, T16B, _lo, t0);  break;
+      default: ShouldNotReachHere();
+    }
+
+    // Sprinkle the loads of the next blocks among the reduce instructions (first copy only)
+    if (_data->is_valid() && _once) {
+      assert(length() >= unrolls(), "not enough room for interleaved loads");
+      if (index < unrolls()) {
+        ld1((_data + index*register_stride), T16B, post(r2, 0x10));  // NOTE(review): r2 is hard-coded; presumably the caller's data pointer -- confirm
+      }
+    }
+  }
+
+  virtual KernelGenerator *next() {
+    GHASHReduceGenerator *result
+      = new GHASHReduceGenerator(this, _unrolls,
+                                 _result, _lo, _hi, _p, _vzr, _data, _t1);
+    result->_result += register_stride;
+    result->_hi += register_stride;
+    result->_lo += register_stride;
+    result->_t1 += register_stride;
+    result->_once = false;  // clones never emit the loads; _data itself is not shifted
+    return result;
+  }
+
+  virtual int length() { return 7; }
+};
+
+// Perform a GHASH multiply/reduce on a single FloatRegister.
+void MacroAssembler::ghash_modmul(FloatRegister result,
+                                  FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
+                                  FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
+                                  FloatRegister t1, FloatRegister t2, FloatRegister t3) {
+  ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);  // result_hi:result_lo = a * b
+  ghash_reduce(result, result_lo, result_hi, p, vzr, t1);             // result = the product reduced by p
+}
+
+// Interleaved GHASH processing.
+//
+// Clobbers all vector registers.
+// blocks is decremented by unrolls for each group of blocks consumed.
+void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
+                                              Register subkeyH,
+                                              Register data, Register blocks, int unrolls) {
+  int register_stride = 7;
+
+  // Bafflingly, GCM uses little-endian for the byte order, but
+  // big-endian for the bit order.  For example, the polynomial 1 is
+  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
+  //
+  // So, we must either reverse the bytes in each word and do
+  // everything big-endian or reverse the bits in each byte and do
+  // it little-endian.  On AArch64 it's more idiomatic to reverse
+  // the bits in each byte (we have an instruction, RBIT, to do
+  // that) and keep the data in little-endian bit order throughout the
+  // calculation, bit-reversing the inputs and outputs.
+
+  assert(unrolls * register_stride < 32, "out of registers");
+
+  FloatRegister a1_xor_a0 = v28;
+  FloatRegister Hprime = v29;
+  FloatRegister vzr = v30;
+  FloatRegister p = v31;
+  eor(vzr, T16B, vzr, vzr); // zero register
+
+  ldrq(p, field_polynomial);    // The field polynomial
+
+  ldrq(v0, Address(state));
+  ldrq(Hprime, Address(subkeyH));
+
+  rev64(v0, T16B, v0);          // Bit-reverse words in state and subkeyH
+  rbit(v0, T16B, v0);
+  rev64(Hprime, T16B, Hprime);
+  rbit(Hprime, T16B, Hprime);
+
+  // Powers of H -> Hprime
+
+  Label already_calculated, done;
+  {
+    // The first time around we'll have to calculate H**2, H**3, etc.
+    // Look at the largest power of H in the subkeyH array to see if
+    // it's already been calculated.
+    ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
+    orr(rscratch1, rscratch1, rscratch2);
+    cbnz(rscratch1, already_calculated);
+
+    orr(v6, T16B, Hprime, Hprime);  // Start with H in v6 and Hprime
+    for (int i = 1; i < unrolls; i++) {
+      ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
+      eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+      ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
+                   Hprime, vzr, a1_xor_a0, p,
+                   /*temps*/v1, v3, v2);
+      rev64(v1, T16B, v6);
+      rbit(v1, T16B, v1);
+      strq(v1, Address(subkeyH, 16 * i));
+    }
+    b(done);
+  }
+  {
+    bind(already_calculated);
+
+    // Load the largest power of H we need into v6.
+    ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
+    rev64(v6, T16B, v6);
+    rbit(v6, T16B, v6);
+  }
+  bind(done);
+
+  orr(Hprime, T16B, v6, v6);     // Move H ** unrolls into Hprime
+
+  // Hprime now holds H ** unrolls; subkeyH[] holds (H ** 1, H ** 2, ... H ** unrolls)
+  // v0 contains the initial state. Clear the others.
+  for (int i = 1; i < unrolls; i++) {
+    int ofs = register_stride * i;
+    eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register
+  }
+
+  ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
+  eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+
+  // Load #unrolls blocks of data
+  for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
+    ld1(v2+ofs, T16B, post(data, 0x10));
+  }
+
+  // Register assignments, replicated across the clones at a stride of 7 registers
+  //
+  // v0: input / output: current state, result of multiply/reduce
+  // v1: temp
+  // v2: input: one block of data (the ciphertext)
+  //     also used as a temp once the data has been consumed
+  // v3: temp
+  // v4: output: high part of product
+  // v5: output: low part ...
+  // v6: unused
+  //
+  // Not replicated:
+  //
+  // v28: High part of H xor low part of H'
+  // v29: H' (hash subkey)
+  // v30: zero
+  // v31: Reduction polynomial of the Galois field
+
+  // Inner loop.
+  // Do the whole load/add/multiply/reduce over all our data except
+  // the last few rows.
+  {
+    Label L_ghash_loop;
+    bind(L_ghash_loop);
+
+    // Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
+    // prfm(Address(data, 128), PLDL1KEEP);
+
+    // Xor data into current state
+    for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
+      rbit((v2+ofs), T16B, (v2+ofs));
+      eor((v2+ofs), T16B, v0+ofs, (v2+ofs));   // bit-swapped data ^ bit-swapped state
+    }
+
+    // Generate fully-unrolled multiply-reduce in two stages.
+
+    (new GHASHMultiplyGenerator(this, unrolls,
+                                /*result_lo*/v5, /*result_hi*/v4, /*data*/v2,
+                                Hprime, a1_xor_a0, p, vzr,
+                                /*temps*/v1, v3, /* reuse b*/v2))->unroll();
+
+    // NB: GHASHReduceGenerator also loads the next #unrolls blocks of
+    // data into v2, v2+ofs, ready to be xored into the state.
+    (new GHASHReduceGenerator (this, unrolls,
+                               /*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr,
+                               /*data*/v2, /*temp*/v3))->unroll();
+
+    sub(blocks, blocks, unrolls);
+    cmp(blocks, (unsigned char)(unrolls * 2));
+    br(GE, L_ghash_loop);
+  }
+
+  // Merge the #unrolls states.  Note that the data for the next
+  // iteration has already been loaded into v2, v2+ofs, etc...
+
+  // First, we multiply/reduce each clone by the appropriate power of H.
+  for (int i = 0; i < unrolls; i++) {
+    int ofs = register_stride * i;
+    ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1)));
+
+    rbit(v2+ofs, T16B, v2+ofs);
+    eor(v2+ofs, T16B, ofs+v0, v2+ofs);   // bit-swapped data ^ bit-swapped state
+
+    rev64(Hprime, T16B, Hprime);
+    rbit(Hprime, T16B, Hprime);
+    ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
+    eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
+    ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs,
+                 Hprime, vzr, a1_xor_a0, p,
+                 /*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs);
+  }
+
+  // Then we sum the results.
+  for (int i = 0; i < unrolls - 1; i++) {
+    int ofs = register_stride * i;
+    eor(v0, T16B, v0, v0 + register_stride + ofs);
+  }
+
+  sub(blocks, blocks, (unsigned char)unrolls);
+
+  // And finally bit-reverse the state back to big endian.
+  rev64(v0, T16B, v0);
+  rbit(v0, T16B, v0);
+  st1(v0, T16B, state);
+}
\ No newline at end of file
diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
index 2e2e8ae78..c024dec55 100644
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
@@ -2804,6 +2804,266 @@ class StubGenerator: public StubCodeGenerator {
       return start;
   }

+  // CTR AES crypt.
+    // Arguments:
+    //
+    // Inputs:
+    //   c_rarg0   - source byte array address
+    //   c_rarg1   - destination byte array address
+    //   c_rarg2   - K (key) in little endian int array
+    //   c_rarg3   - counter vector byte array address
+    //   c_rarg4   - input length
+    //   c_rarg5   - address of the saved encryptedCounter buffer
+    //   c_rarg6   - address of the saved 'used' length (read on entry, written back on exit)
+    //
+    // Output:
+    //   r0       - input length
+    //
+    address generate_counterMode_AESCrypt() {
+      const Register in = c_rarg0;
+      const Register out = c_rarg1;
+      const Register key = c_rarg2;
+      const Register counter = c_rarg3;
+      const Register saved_len = c_rarg4, len = r10;  // len (r10) is the working count of bytes still to process
+      const Register saved_encrypted_ctr = c_rarg5;
+      const Register used_ptr = c_rarg6, used = r12;  // used (r12) caches the int at *used_ptr
+
+      const Register offset = r7;  // running byte offset applied to both in and out
+      const Register keylen = r11;  // length of the key int array, loaded below
+
+      const unsigned char block_size = 16;
+      const int bulk_width = 4;
+      // NB: bulk_width can be 4 or 8. 8 gives slightly faster
+      // performance with larger data sizes, but it also means that the
+      // fast path isn't used until you have at least 8 blocks, and up
+      // to 127 bytes of data will be executed on the slow path. For
+      // that reason, and also so as not to blow away too much icache, 4
+      // blocks seems like a sensible compromise.
+
+      // Algorithm:
+      //
+      //    if (len == 0) {
+      //        goto DONE;
+      //    }
+      //    int result = len;
+      //    do {
+      //        if (used >= blockSize) {
+      //            if (len >= bulk_width * blockSize) {
+      //                CTR_large_block();
+      //                if (len == 0)
+      //                    goto DONE;
+      //            }
+      //            for (;;) {
+      //                16ByteVector v0 = counter;
+      //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
+      //                used = 0;
+      //                if (len < blockSize)
+      //                    break;    /* goto NEXT */
+      //                16ByteVector v1 = load16Bytes(in, offset);
+      //                v1 = v1 ^ encryptedCounter;
+      //                store16Bytes(out, offset);
+      //                used = blockSize;
+      //                offset += blockSize;
+      //                len -= blockSize;
+      //                if (len == 0)
+      //                    goto DONE;
+      //            }
+      //        }
+      //      NEXT:
+      //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
+      //        len--;
+      //    } while (len != 0);
+      //  DONE:
+      //    return result;
+      //
+      // CTR_large_block()
+      //    Wide bulk encryption of whole blocks.
+
+      __ align(CodeEntryAlignment);
+      StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+      const address start = __ pc();
+      __ enter();
+
+      Label DONE, CTR_large_block, large_block_return;
+      __ ldrw(used, Address(used_ptr));  // used = *used_ptr
+      __ cbzw(saved_len, DONE);  // zero-length input: skip straight to the exit path
+
+      __ mov(len, saved_len);  // len counts down; saved_len is kept for the return value
+      __ mov(offset, 0);
+
+      // Compute #rounds for AES based on the length of the key array
+      __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+      __ aesenc_loadkeys(key, keylen);
+
+      {
+        Label L_CTR_loop, NEXT;
+
+        __ bind(L_CTR_loop);
+
+        __ cmp(used, block_size);
+        __ br(__ LO, NEXT);  // part of the saved encrypted counter is still unused: consume it byte-wise
+
+        // Maybe we have a lot of data
+        __ subsw(rscratch1, len, bulk_width * block_size);
+        __ br(__ HS, CTR_large_block);  // taken when len >= bulk_width * block_size
+        __ BIND(large_block_return);
+        __ cbzw(len, DONE);
+
+        // Setup the counter
+        __ movi(v4, __ T4S, 0);
+        __ movi(v5, __ T4S, 1);
+        __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
+
+        __ ld1(v0, __ T16B, counter); // Load the counter into v0
+        __ rev32(v16, __ T16B, v0);
+        __ addv(v16, __ T4S, v16, v4);
+        __ rev32(v16, __ T16B, v16);
+        __ st1(v16, __ T16B, counter); // Save the incremented counter back
+
+        {
+          // We have fewer than bulk_width blocks of data left. Encrypt
+          // them one by one until there is less than a full block
+          // remaining, being careful to save both the encrypted counter
+          // and the counter.
+
+          Label inner_loop;
+          __ bind(inner_loop);
+          // Counter to encrypt is in v0
+          __ aesecb_encrypt(noreg, noreg, keylen);
+          __ st1(v0, __ T16B, saved_encrypted_ctr);  // keep the encrypted counter for later byte-wise use
+
+          // Do we have a remaining full block?
+
+          __ mov(used, 0);
+          __ cmp(len, block_size);
+          __ br(__ LO, NEXT);
+
+          // Yes, we have a full block
+          __ ldrq(v1, Address(in, offset));
+          __ eor(v1, __ T16B, v1, v0);
+          __ strq(v1, Address(out, offset));
+          __ mov(used, block_size);
+          __ add(offset, offset, block_size);
+
+          __ subw(len, len, block_size);
+          __ cbzw(len, DONE);
+
+          // Increment the counter, store it back
+          __ orr(v0, __ T16B, v16, v16);  // v0 = v16, the counter value most recently stored back
+          __ rev32(v16, __ T16B, v16);
+          __ addv(v16, __ T4S, v16, v4);
+          __ rev32(v16, __ T16B, v16);
+          __ st1(v16, __ T16B, counter); // Save the incremented counter back
+
+          __ b(inner_loop);
+        }
+
+        __ BIND(NEXT);
+
+        // Encrypt a single byte, and loop.
+        // We expect this to be a rare event.
+        __ ldrb(rscratch1, Address(in, offset));
+        __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
+        __ eor(rscratch1, rscratch1, rscratch2);
+        __ strb(rscratch1, Address(out, offset));
+        __ add(offset, offset, 1);
+        __ add(used, used, 1);
+        __ subw(len, len,1);
+        __ cbnzw(len, L_CTR_loop);
+      }
+
+      __ bind(DONE);
+      __ strw(used, Address(used_ptr));  // publish the updated 'used' count through used_ptr
+      __ mov(r0, saved_len);  // return value is the original input length
+
+      __ leave(); // required for proper stackwalking of RuntimeStub frame
+      __ ret(lr);
+
+      // Bulk encryption
+
+      __ BIND (CTR_large_block);
+      assert(bulk_width == 4 || bulk_width == 8, "must be");
+
+      if (bulk_width == 8) {
+        __ sub(sp, sp, 4 * 16);
+        __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
+      }
+      __ sub(sp, sp, 4 * 16);
+      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
+      RegSet saved_regs = (RegSet::of(in, out, offset)
+                           + RegSet::of(saved_encrypted_ctr, used_ptr, len));
+      __ push(saved_regs, sp);  // len is pushed before being rounded down, so the pop below restores the full count
+      __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
+      __ add(in, in, offset);
+      __ add(out, out, offset);
+
+      // Keys should already be loaded into the correct registers
+
+      __ ld1(v0, __ T16B, counter); // v0 contains the first counter
+      __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
+
+      // AES/CTR loop
+      {
+        Label L_CTR_loop;
+        __ BIND(L_CTR_loop);
+
+        // Setup the counters
+        __ movi(v8, __ T4S, 0);
+        __ movi(v9, __ T4S, 1);
+        __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
+
+        for (FloatRegister f = v0; f < v0 + bulk_width; f++) {
+          __ rev32(f, __ T16B, v16);
+          __ addv(v16, __ T4S, v16, v8);
+        }
+
+        __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
+
+        // Encrypt the counters
+        __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
+
+        if (bulk_width == 8) {
+          __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
+        }
+
+        // XOR the encrypted counters with the inputs
+        for (int i = 0; i < bulk_width; i++) {
+          __ eor(v0 + i, __ T16B, v0 + i, v8 + i);
+        }
+
+        // Write the encrypted data
+        __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
+        if (bulk_width == 8) {
+          __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
+        }
+
+        __ subw(len, len, 16 * bulk_width);
+        __ cbnzw(len, L_CTR_loop);
+      }
+
+      // Save the counter back where it goes
+      __ rev32(v16, __ T16B, v16);
+      __ st1(v16, __ T16B, counter);
+
+      __ pop(saved_regs, sp);
+
+      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
+      if (bulk_width == 8) {
+        __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
+      }
+
+      __ andr(rscratch1, len, -16 * bulk_width);  // rscratch1 = bytes consumed by the bulk loop
+      __ sub(len, len, rscratch1);
+      __ add(offset, offset, rscratch1);
+      __ mov(used, 16);  // used = 16 (same value as block_size), also written through used_ptr below
+      __ strw(used, Address(used_ptr));
+      __ b(large_block_return);
+
+      return start;
+    }
+
+
   // Arguments:
   //
   // Inputs:
@@ -3677,6 +3937,56 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+  address generate_ghash_processBlocks_wide() {
+    address small = generate_ghash_processBlocks();  // emit the non-wide stub first; its entry is the fallback branch target below
+
+    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
+    __ align(wordSize * 2);
+    address p = __ pc();  // address of the constant emitted next; passed to the wide routine
+    __ emit_int64(0x87);  // The low-order bits of the field
+                            // polynomial (i.e. p = z^7+z^2+z+1)
+                            // repeated in the low and high parts of a
+                            // 128-bit vector
+    __ emit_int64(0x87);
+
+    __ align(CodeEntryAlignment);
+    address start = __ pc();  // stub entry point; the constant above precedes it
+
+    Register state   = c_rarg0;
+    Register subkeyH = c_rarg1;
+    Register data    = c_rarg2;
+    Register blocks  = c_rarg3;
+
+    const int unroll = 4;
+
+    __ cmp(blocks, (unsigned char)(unroll * 2));
+    __ br(__ LT, small);  // fewer than 2 * unroll blocks: let the non-wide stub handle everything
+
+    if (unroll > 1) {
+    // Save v8..v15 on the stack before entering the wide routine
+      __ sub(sp, sp, 4 * 16);
+      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
+      __ sub(sp, sp, 4 * 16);
+      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
+    }
+
+    __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll);
+
+    if (unroll > 1) {
+      // And restore v8..v15, in the reverse order of the saves
+      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
+      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
+    }
+
+    __ cmp(blocks, 0u);
+    __ br(__ GT, small);  // blocks left over after the wide routine are handed to the non-wide stub
+
+    __ ret(lr);
+
+    return start;
+  }
+
+
   // Continuation point for throwing of implicit exceptions that are
   // not handled in the current activation. Fabricates an exception
   // oop and initiates normal exception dispatching in this
@@ -4687,6 +4997,15 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_montgomerySquare = g.generate_multiply();
     }

+    // generate GHASH intrinsics code
+    if (UseGHASHIntrinsics) {
+      if (UseAESCTRIntrinsics) {
+        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
+      } else {
+        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+      }
+    }
+
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
@@ -4694,9 +5013,8 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }

-    // generate GHASH intrinsics code
-    if (UseGHASHIntrinsics) {
-      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    if (UseAESCTRIntrinsics) {
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
     }

     if (UseSHA1Intrinsics) {
diff --git a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp
index d1c312ab3..05619ce7f 100644
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp
@@ -37,7 +37,7 @@ static bool    returns_to_call_stub(address return_pc)   {

 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 32000           // simply increase if too small (assembler will crash if too small)
 };

 class aarch64 {
diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
index 9808337a0..de636fb83 100644
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
@@ -233,12 +233,21 @@ void VM_Version::get_processor_features() {
       warning("UseAESIntrinsics enabled, but UseAES not, enabling");
       UseAES = true;
     }
+    if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+    }
   } else {
     if (UseAES) {
-      warning("UseAES specified, but not supported on this CPU");
+      warning("AES instructions are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAES, false);
     }
     if (UseAESIntrinsics) {
-      warning("UseAESIntrinsics specified, but not supported on this CPU");
+      warning("AES intrinsics are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAESIntrinsics, false);
+    }
+    if (UseAESCTRIntrinsics) {
+      warning("AES/CTR intrinsics are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
     }
   }

diff --git a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
index b5ce1cfa9..fea8b1f87 100644
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
@@ -194,6 +194,11 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseAESIntrinsics, false);
   }

+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   if (UseGHASHIntrinsics) {
     warning("GHASH intrinsics are not available on this CPU");
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
diff --git a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
index bd893e138..08d7a7311 100644
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
@@ -319,6 +319,11 @@ void VM_Version::initialize() {
     }
   }

+  if (UseAESCTRIntrinsics) {
+    warning("AES/CTR intrinsics are not available on this CPU");
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   // GHASH/GCM intrinsics
   if (has_vis3() && (UseVIS > 2)) {
     if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
index 1759ecdfd..ddc1acfd8 100644
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@@ -2373,20 +2373,52 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) {

 void Assembler::pextrd(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false);
+  int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
 }

+void Assembler::pextrd(Address dst, XMMRegister src, int imm8) {  // pextrd with a memory destination
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, false);  // 66 prefix, 0F 3A opcode map; trailing 'false' is presumably rex_w (32-bit form) -- confirm
+  emit_int8(0x16);  // opcode byte, same value the register-destination pextrd emits
+  emit_operand(src, dst);  // encodes src together with the memory operand dst
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
 void Assembler::pextrq(Register dst, XMMRegister src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
-  int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true);
+  int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true);
   emit_int8(0x16);
   emit_int8((unsigned char)(0xC0 | encode));
   emit_int8(imm8);
 }

+void Assembler::pextrq(Address dst, XMMRegister src, int imm8) {  // pextrq with a memory destination
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, true);  // differs from pextrd(Address, ...) only in the trailing 'true' (presumably rex_w, 64-bit form -- confirm)
+  emit_int8(0x16);  // opcode byte shared with pextrd
+  emit_operand(src, dst);  // encodes src together with the memory operand dst
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
+void Assembler::pextrw(Address dst, XMMRegister src, int imm8) {  // pextrw with a memory destination
+  assert(VM_Version::supports_sse4_1(), "");  // this memory-destination form is guarded by SSE4.1
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A);  // 66 prefix, 0F 3A opcode map
+  emit_int8((unsigned char)0x15);  // opcode byte
+  emit_operand(src, dst);  // encodes src together with the memory operand dst
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
+void Assembler::pextrb(Address dst, XMMRegister src, int imm8) {  // pextrb with a memory destination
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A);  // 66 prefix, 0F 3A opcode map
+  emit_int8(0x14);  // opcode byte
+  emit_operand(src, dst);  // encodes src together with the memory operand dst
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
 void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false);
@@ -2395,6 +2427,14 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) {
   emit_int8(imm8);
 }

+void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) {  // pinsrd with a memory source
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false);  // dst is passed as both first and second register; trailing 'false' is presumably rex_w (32-bit form) -- confirm
+  emit_int8(0x22);  // opcode byte, same value the register-source pinsrd emits
+  emit_operand(dst,src);  // encodes dst together with the memory operand src
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
 void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   assert(VM_Version::supports_sse4_1(), "");
   int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true);
@@ -2403,6 +2443,30 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) {
   emit_int8(imm8);
 }

+void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) {  // pinsrq with a memory source
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true);  // differs from pinsrd(XMMRegister, Address, ...) only in the trailing 'true' (presumably rex_w, 64-bit form -- confirm)
+  emit_int8(0x22);  // opcode byte shared with pinsrd
+  emit_operand(dst, src);  // encodes dst together with the memory operand src
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
+void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) {  // pinsrw with a memory source
+  assert(VM_Version::supports_sse2(), "");  // unlike the other new insert forms, this one only requires SSE2
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F);  // 66 prefix, plain 0F opcode map (not 0F 3A)
+  emit_int8((unsigned char)0xC4);  // opcode byte
+  emit_operand(dst, src);  // encodes dst together with the memory operand src
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
+void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) {  // pinsrb with a memory source
+  assert(VM_Version::supports_sse4_1(), "");  // only valid when the CPU reports SSE4.1
+  simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A);  // 66 prefix, 0F 3A opcode map
+  emit_int8(0x20);  // opcode byte
+  emit_operand(dst, src);  // encodes dst together with the memory operand src
+  emit_int8(imm8);  // the immediate byte is emitted last
+}
+
 void Assembler::pmovzxbw(XMMRegister dst, Address src) {
   assert(VM_Version::supports_sse4_1(), "");
   InstructionMark im(this);
@@ -3075,6 +3139,12 @@ void Assembler::xorl(Register dst, Register src) {
   emit_arith(0x33, 0xC0, dst, src);
 }

+void Assembler::xorb(Register dst, Address src) {  // byte-sized xor of a memory operand into dst
+  InstructionMark im(this);
+  prefix(src, dst);  // NOTE(review): no byte-register handling is requested from prefix() here -- confirm this is safe for every dst callers pass
+  emit_int8(0x32);  // opcode byte
+  emit_operand(dst, src);  // encodes dst together with the memory operand src
+}

 // AVX 3-operands scalar float-point arithmetic instructions

diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
index 5ea01311e..c2e70bc2a 100644
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@@ -1479,10 +1479,20 @@ private:
   // SSE 4.1 extract
   void pextrd(Register dst, XMMRegister src, int imm8);
   void pextrq(Register dst, XMMRegister src, int imm8);
+  void pextrd(Address dst, XMMRegister src, int imm8);
+  void pextrq(Address dst, XMMRegister src, int imm8);
+  void pextrb(Address dst, XMMRegister src, int imm8);
+  // SSE 4.1 extract (memory-destination pextrw; the implementation asserts supports_sse4_1)
+  void pextrw(Address dst, XMMRegister src, int imm8);

   // SSE 4.1 insert
   void pinsrd(XMMRegister dst, Register src, int imm8);
   void pinsrq(XMMRegister dst, Register src, int imm8);
+  void pinsrd(XMMRegister dst, Address src, int imm8);
+  void pinsrq(XMMRegister dst, Address src, int imm8);
+  void pinsrb(XMMRegister dst, Address src, int imm8);
+  // SSE 2 insert
+  void pinsrw(XMMRegister dst, Address src, int imm8);

   // SSE4.1 packed move
   void pmovzxbw(XMMRegister dst, XMMRegister src);
@@ -1687,6 +1697,8 @@ private:
   void xorl(Register dst, Address src);
   void xorl(Register dst, Register src);

+  void xorb(Register dst, Address src);
+
   void xorq(Register dst, Address src);
   void xorq(Register dst, Register src);

diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
index 2e5599807..f555f3326 100644
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@@ -2153,6 +2153,17 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+  address generate_counter_shuffle_mask() {  // emits a 16-byte constant into the stub area and returns its address
+    __ align(16);  // the constant starts on a 16-byte boundary
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data(0x0c0d0e0f, relocInfo::none, 0);  // four data words follow, lowest address first
+    __ emit_data(0x08090a0b, relocInfo::none, 0);
+    __ emit_data(0x04050607, relocInfo::none, 0);
+    __ emit_data(0x00010203, relocInfo::none, 0);  // NOTE(review): used as a pshufb control this looks like a full 16-byte reversal -- confirm
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an xmmregister
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -2178,6 +2189,31 @@ class StubGenerator: public StubCodeGenerator {
     __ aesdec(xmmdst, xmmtmp);
   }

+  // Utility routine for incrementing the 128-bit counter (IV in CTR mode) held in xmmdst; reg is used as scratch.
+  //  XMM_128bit,  D3, D2, D1, D0: inc_delta is added to D0 and a carry ripples into D1, D2, D3 in turn
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrd(reg, xmmdst, 0x0);  // reg = D0
+    __ addl(reg, inc_delta);
+    __ pinsrd(xmmdst, reg, 0x0);  // D0 = reg
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x01); // Carry-> D1
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x01);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x02); // Carry-> D2
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x02);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+
+    __ pextrd(reg, xmmdst, 0x03); // Carry -> D3
+    __ addl(reg, 0x01);
+    __ pinsrd(xmmdst, reg, 0x03);  // a carry out of D3 is not propagated any further
+
+    __ BIND(next_block);          // next instruction; the caller-supplied label is bound here
+  }
+

   // Arguments:
   //
@@ -2719,6 +2755,309 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+
+  // CTR AES crypt.
+  // In 32-bit stub, parallelize 4 blocks at a time
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from        = rsi;      // source array address
+    const Register to          = rdx;      // destination array address
+    const Register key         = rcx;      // key array address
+    const Register counter     = rdi;      // counter byte array initialized from initvector array address
+
+    // and left with the results of the last encryption block
+    const Register len_reg     = rbx;
+    const Register pos         = rax;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi
+
+    // load registers from incoming parameters
+    const Address  from_param(rbp, 8+0);
+    const Address  to_param  (rbp, 8+4);
+    const Address  key_param (rbp, 8+8);
+    const Address  rvec_param (rbp, 8+12);
+    const Address  len_param  (rbp, 8+16);
+    const Address  saved_counter_param(rbp, 8 + 20);
+    const Address  used_addr_param(rbp, 8 + 24);
+
+    __ movptr(from , from_param);
+    __ movptr(to   , to_param);
+    //__ movptr(key, key_param);
+    //__ movptr(counter, rvec_param);
+    __ movptr(len_reg , len_param);
+    //__ movptr(pos, 0);
+
+    // Use the partially used encrypted counter from last invocation
+    Label L_exit_preLoop, L_preLoop_start;
+
+    // Use the registers 'counter' and 'key' here in this preloop
+    // to hold the last 2 params, 'used' and 'saved_encCounter_start'
+    Register used = counter;
+    Register saved_encCounter_start = key;
+    Register used_addr = saved_encCounter_start;
+
+    __ movptr(used_addr, used_addr_param);
+    __ movptr(used, Address(used_addr, 0));
+    __ movptr(saved_encCounter_start, saved_counter_param);
+
+    __ BIND(L_preLoop_start);
+    __ cmpptr(used, 16);
+    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+    __ cmpptr(len_reg, 0);
+    __ jcc(Assembler::lessEqual, L_exit_preLoop);
+    __ movb(rax, Address(saved_encCounter_start, used));
+    __ xorb(rax, Address(from, 0));
+    __ movb(Address(to, 0), rax);
+    __ addptr(from, 1);
+    __ addptr(to, 1);
+    __ addptr(used, 1);
+    __ subptr(len_reg, 1);
+
+    __ jmp(L_preLoop_start);
+
+    __ BIND(L_exit_preLoop);
+    __ movptr(used_addr, used_addr_param);
+    __ movptr(used_addr, used_addr_param);
+    __ movl(Address(used_addr, 0), used);
+
+    // load the parameters 'key' and 'counter'
+    __ movptr(key, key_param);
+    __ movptr(counter, rvec_param);
+
+    // xmm register assignments for the loops below
+    const XMMRegister xmm_curr_counter      = xmm0;
+    const XMMRegister xmm_counter_shuf_mask = xmm1;  // need to be reloaded
+    const XMMRegister xmm_key_shuf_mask     = xmm2;  // need to be reloaded
+    const XMMRegister xmm_key               = xmm3;
+    const XMMRegister xmm_result0           = xmm4;
+    const XMMRegister xmm_result1           = xmm5;
+    const XMMRegister xmm_result2           = xmm6;
+    const XMMRegister xmm_result3           = xmm7;
+    const XMMRegister xmm_from0             = xmm1;   //reuse XMM register
+    const XMMRegister xmm_from1             = xmm2;
+    const XMMRegister xmm_from2             = xmm3;
+    const XMMRegister xmm_from3             = xmm4;
+
+    //for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_singleBlockLoopTop[3];
+    Label L_multiBlock_loopTop[3];
+    Label L_key192_top, L_key256_top;
+    Label L_incCounter[3][4]; // 3: different key length,  4: 4 blocks at a time
+    Label L_incCounter_single[3]; //for single block, key128, key192, key256
+    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+    Label L_exit;
+    const int PARALLEL_FACTOR = 4;  //because of the limited register number
+
+    // initialize counter with initial counter
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00));
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rax, 52);
+    __ jcc(Assembler::equal, L_key192_top);
+    __ cmpl(rax, 60);
+    __ jcc(Assembler::equal, L_key256_top);
+
+    //key128 begins here
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+
+#define CTR_DoFour(opc, src_reg)               \
+    __ opc(xmm_result0, src_reg);              \
+    __ opc(xmm_result1, src_reg);              \
+    __ opc(xmm_result2, src_reg);              \
+    __ opc(xmm_result3, src_reg);
+
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      //multi blocks starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+
+      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+
+      //load, then increase counters
+      CTR_DoFour(movdqa, xmm_curr_counter);
+      __ push(rbx);
+      inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]);
+      inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]);
+      __ pop (rbx);
+
+      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. interleaving for better performance
+
+      CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
+      CTR_DoFour(pxor, xmm_key);   //PXOR with Round 0 key
+
+      for (int i = 1; i < rounds[k]; ++i) {
+        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+        CTR_DoFour(aesenc, xmm_key);
+      }
+      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+      CTR_DoFour(aesenclast, xmm_key);
+
+      // get next PARALLEL_FACTOR blocks into xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+
+      // PXOR with input text
+      __ pxor(xmm_result0, xmm_from0); //result0 is xmm4
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+
+      // store PARALLEL_FACTOR results into the next 64 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+
+      // do it here after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0.
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ pxor(xmm_result3, xmm_from3);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // singleBlock starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::equal, L_exit);
+      __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+      __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      load_key(xmm_key, key, 0x00, xmm_key_shuf_mask);
+      __ push(rbx);//rbx is used for increasing counter
+      inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]);
+      __ pop (rbx);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key);
+      }
+      load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key);
+      __ cmpptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::less, L_processTail_insr[k]);
+        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+        __ pxor(xmm_result0, xmm_from0);
+        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+        __ addptr(pos, AESBlockSize);
+        __ subptr(len_reg, AESBlockSize);
+        __ jmp(L_singleBlockLoopTop[k]);
+
+      __ BIND(L_processTail_insr[k]);
+        __ addptr(pos, len_reg);
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+          __ subptr(pos,8);
+          __ pinsrd(xmm_from0, Address(from, pos), 0);
+          __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1);
+        __ BIND(L_processTail_4_insr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+          __ subptr(pos,4);
+          __ pslldq(xmm_from0, 4);
+          __ pinsrd(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_2_insr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+          __ subptr(pos, 2);
+          __ pslldq(xmm_from0, 2);
+          __ pinsrw(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_1_insr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+          __ subptr(pos, 1);
+          __ pslldq(xmm_from0, 1);
+          __ pinsrb(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_exit_insr[k]);
+
+        __ movptr(saved_encCounter_start, saved_counter_param);
+        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);
+        __ pxor(xmm_result0, xmm_from0);
+
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_extr[k]);
+          __ pextrd(Address(to, pos), xmm_result0, 0);
+          __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1);
+          __ psrldq(xmm_result0, 8);
+          __ addptr(pos, 8);
+        __ BIND(L_processTail_4_extr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
+          __ pextrd(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 4);
+          __ addptr(pos, 4);
+        __ BIND(L_processTail_2_extr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
+          __ pextrb(Address(to, pos), xmm_result0, 0);
+          __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1);
+          __ psrldq(xmm_result0, 2);
+          __ addptr(pos, 2);
+        __ BIND(L_processTail_1_extr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
+          __ pextrb(Address(to, pos), xmm_result0, 0);
+
+        __ BIND(L_processTail_exit_extr[k]);
+        __ movptr(used_addr, used_addr_param);
+        __ movl(Address(used_addr, 0), len_reg);
+        __ jmp(L_exit);
+    }
+
+    __ BIND(L_exit);
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
+    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
+    handleSOERegisters(false /*restoring*/);
+    __ movptr(rax, len_param); // return length
+    __ leave();                // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+
+    __ BIND (L_key192_top);
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+    __ jmp(L_multiBlock_loopTop[1]); //key192
+
+    __ BIND (L_key256_top);
+    __ movptr(pos, 0); // init pos before L_multiBlock_loopTop
+    __ jmp(L_multiBlock_loopTop[2]); //key192
+
+    return start;
+  }
+
+
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
     __ align(CodeEntryAlignment);
@@ -3181,6 +3520,11 @@ class StubGenerator: public StubCodeGenerator {
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
     }

+    if (UseAESCTRIntrinsics) {
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+    }
+
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index c5811b28b..254f63392 100644
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -3010,6 +3010,15 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+  address generate_counter_shuffle_mask() { // emits the 16-byte pshufb mask applied to the CTR counter
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); // mask bytes 0..7  select source bytes 15..8
+    __ emit_data64(0x0001020304050607, relocInfo::none); // mask bytes 8..15 select source bytes 7..0 (full byte reversal)
+    return start;
+  }
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an xmmregister
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
@@ -3021,6 +3030,18 @@ class StubGenerator: public StubCodeGenerator {
     }
   }

+  // Utility routine: adds inc_delta to the 128-bit counter (IV in CTR mode) held in xmmdst; clobbers reg.
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { // next_block is bound here: pass a fresh Label per call
+    __ pextrq(reg, xmmdst, 0x0);   // fetch the low quadword of the counter
+    __ addq(reg, inc_delta);       // sets the carry flag when the low quadword wraps
+    __ pinsrq(xmmdst, reg, 0x0);   // write the low quadword back (pinsrq leaves the flags from addq intact)
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+    __ pextrq(reg, xmmdst, 0x01); // Carry: propagate into the high quadword
+    __ addq(reg, 0x01);
+    __ pinsrq(xmmdst, reg, 0x01); // Carry end
+    __ BIND(next_block);          // next instruction
+  }
+
   // Arguments:
   //
   // Inputs:
@@ -3639,6 +3660,320 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+  // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
+  // to hide instruction latency
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   Linux
+  //     c_rarg4   -          input length
+  //     c_rarg5   -          saved encryptedCounter start
+  //     rbp + 2 * wordSize - address of the saved used length
+  //   Windows
+  //     rbp + 6 * wordSize - input length
+  //     rbp + 7 * wordSize - saved encryptedCounter start
+  //     rbp + 8 * wordSize - address of the saved used length
+  //
+  // Output:
+  //   rax       - input length
+  //
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from = c_rarg0; // source array address
+    const Register to = c_rarg1; // destination array address
+    const Register key = c_rarg2; // key array address
+    const Register counter = c_rarg3; // counter byte array initialized from counter array address
+    // and left with the results of the last encryption block
+#ifndef _WIN64
+    const Register len_reg = c_rarg4;
+    const Register saved_encCounter_start = c_rarg5;
+    const Register used_addr = r10;
+    const Address  used_mem(rbp, 2 * wordSize);
+    const Register used = r11;
+#else
+    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
+    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
+    const Address used_mem(rbp, 8 * wordSize); // address of the used length is on stack on Win64
+    const Register len_reg = r10; // pick the first volatile windows register
+    const Register saved_encCounter_start = r11;
+    const Register used_addr = r13;
+    const Register used = r14;
+#endif
+    const Register pos = rax;
+
+    const int PARALLEL_FACTOR = 6;
+    const XMMRegister xmm_counter_shuf_mask = xmm0;
+    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+    const XMMRegister xmm_curr_counter = xmm2;
+
+    const XMMRegister xmm_key_tmp0 = xmm3;
+    const XMMRegister xmm_key_tmp1 = xmm4;
+
+    // registers holding the six results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm5;
+    const XMMRegister xmm_result1 = xmm6;
+    const XMMRegister xmm_result2 = xmm7;
+    const XMMRegister xmm_result3 = xmm8;
+    const XMMRegister xmm_result4 = xmm9;
+    const XMMRegister xmm_result5 = xmm10;
+
+    const XMMRegister xmm_from0 = xmm11;
+    const XMMRegister xmm_from1 = xmm12;
+    const XMMRegister xmm_from2 = xmm13;
+    const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
+    const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
+    const XMMRegister xmm_from5 = xmm4;
+
+    //for key_128, key_192, key_256
+    const int rounds[3] = {10, 12, 14};
+    Label L_exit_preLoop, L_preLoop_start;
+    Label L_multiBlock_loopTop[3];
+    Label L_singleBlockLoopTop[3];
+    Label L__incCounter[3][6]; //for 6 blocks
+    Label L__incCounter_single[3]; //for single block, key128, key192, key256
+    Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
+    Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
+
+    Label L_exit;
+
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WIN64
+    // save the xmm registers which must be preserved 6-14
+    const int XMM_REG_NUM_KEY_LAST = 14;
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+
+    const Address r13_save(rbp, rdi_off * wordSize);
+    const Address r14_save(rbp, rsi_off * wordSize);
+
+    __ movptr(r13_save, r13);
+    __ movptr(r14_save, r14);
+
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    __ movptr(saved_encCounter_start, saved_encCounter_mem);
+    __ movptr(used_addr, used_mem);
+    __ movl(used, Address(used_addr, 0));
+#else
+    __ push(len_reg); // Save the input length; it is popped into rax as the return value at L_exit
+    __ movptr(used_addr, used_mem);
+    __ movl(used, Address(used_addr, 0));
+#endif
+
+    __ push(rbx); // Save RBX
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
+    __ movptr(pos, 0);
+
+    // Use the partially used encrypted counter from the last invocation, one byte at a time
+    __ BIND(L_preLoop_start);
+    __ cmpptr(used, 16);
+    __ jcc(Assembler::aboveEqual, L_exit_preLoop);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::lessEqual, L_exit_preLoop);
+      __ movb(rbx, Address(saved_encCounter_start, used));
+      __ xorb(rbx, Address(from, pos));
+      __ movb(Address(to, pos), rbx);
+      __ addptr(pos, 1);
+      __ addptr(used, 1);
+      __ subptr(len_reg, 1);
+
+    __ jmp(L_preLoop_start);
+
+    __ BIND(L_exit_preLoop);
+    __ movl(Address(used_addr, 0), used);
+
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); // key_192
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); // key_256; otherwise fall through into the key_128 code (k == 0)
+
+#define CTR_DoSix(opc, src_reg)                \
+    __ opc(xmm_result0, src_reg);              \
+    __ opc(xmm_result1, src_reg);              \
+    __ opc(xmm_result2, src_reg);              \
+    __ opc(xmm_result3, src_reg);              \
+    __ opc(xmm_result4, src_reg);              \
+    __ opc(xmm_result5, src_reg);
+
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      //multi blocks starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+
+      //load, then increase counters
+      CTR_DoSix(movdqa, xmm_curr_counter);
+      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
+      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
+      inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
+      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
+      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
+      CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
+
+      //load two ROUND_KEYs at a time
+      for (int i = 1; i < rounds[k]; ) {
+        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
+        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
+        CTR_DoSix(aesenc, xmm_key_tmp1);
+        i++;
+        if (i != rounds[k]) {
+          CTR_DoSix(aesenc, xmm_key_tmp0);
+        } else {
+          CTR_DoSix(aesenclast, xmm_key_tmp0);
+        }
+        i++;
+      }
+
+      // load the next PARALLEL_FACTOR input blocks into the xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
+      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
+
+      __ pxor(xmm_result0, xmm_from0);
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+      __ pxor(xmm_result3, xmm_from3);
+      __ pxor(xmm_result4, xmm_from4);
+      __ pxor(xmm_result5, xmm_from5);
+
+      // store the 6 results into the next PARALLEL_FACTOR * AESBlockSize bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
+      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
+
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+
+      // singleBlock starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::lessEqual, L_exit);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key_tmp0);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key_tmp0);
+      }
+      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key_tmp0);
+      __ cmpptr(len_reg, AESBlockSize);
+      __ jcc(Assembler::less, L_processTail_insr[k]);
+        __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+        __ pxor(xmm_result0, xmm_from0);
+        __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+        __ addptr(pos, AESBlockSize);
+        __ subptr(len_reg, AESBlockSize);
+        __ jmp(L_singleBlockLoopTop[k]);
+      __ BIND(L_processTail_insr[k]); // fewer than AESBlockSize bytes left: gather them into xmm_from0, walking pos backwards
+        __ addptr(pos, len_reg);
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_insr[k]);
+          __ subptr(pos,8);
+          __ pinsrq(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_4_insr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_insr[k]);
+          __ subptr(pos,4);
+          __ pslldq(xmm_from0, 4);
+          __ pinsrd(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_2_insr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_insr[k]);
+          __ subptr(pos, 2);
+          __ pslldq(xmm_from0, 2);
+          __ pinsrw(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_1_insr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
+          __ subptr(pos, 1);
+          __ pslldq(xmm_from0, 1);
+          __ pinsrb(xmm_from0, Address(from, pos), 0);
+        __ BIND(L_processTail_exit_insr[k]);
+
+        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // keep the encrypted counter for the pre-loop of the next call
+        __ pxor(xmm_result0, xmm_from0);
+
+        __ testptr(len_reg, 8);
+        __ jcc(Assembler::zero, L_processTail_4_extr[k]);
+          __ pextrq(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 8);
+          __ addptr(pos, 8);
+        __ BIND(L_processTail_4_extr[k]);
+        __ testptr(len_reg, 4);
+        __ jcc(Assembler::zero, L_processTail_2_extr[k]);
+          __ pextrd(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 4);
+          __ addptr(pos, 4);
+        __ BIND(L_processTail_2_extr[k]);
+        __ testptr(len_reg, 2);
+        __ jcc(Assembler::zero, L_processTail_1_extr[k]);
+          __ pextrw(Address(to, pos), xmm_result0, 0);
+          __ psrldq(xmm_result0, 2);
+          __ addptr(pos, 2);
+        __ BIND(L_processTail_1_extr[k]);
+        __ testptr(len_reg, 1);
+        __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
+          __ pextrb(Address(to, pos), xmm_result0, 0);
+
+        __ BIND(L_processTail_exit_extr[k]);
+        __ movl(Address(used_addr, 0), len_reg); // record how many bytes of the saved encrypted counter are already used
+        __ jmp(L_exit);
+
+    }
+
+    __ BIND(L_exit);
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
+    __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
+    __ pop(rbx); // pop the saved RBX.
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+    __ movl(rax, len_mem);
+    __ movptr(r13, r13_save);
+    __ movptr(r14, r14_save);
+#else
+    __ pop(rax); // return 'len'
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }

   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
@@ -4239,12 +4574,15 @@ class StubGenerator: public StubCodeGenerator {
     // don't bother generating these AES intrinsic stubs unless global flag is set
     if (UseAESIntrinsics) {
       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
-
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
+    if (UseAESCTRIntrinsics){
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+    }

     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp
index 9b0d8fc75..617879377 100644
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp
@@ -33,6 +33,7 @@

 address StubRoutines::x86::_verify_mxcsr_entry = NULL;
 address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
 address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
 address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;

diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp
index bb160486c..70b5a34ac 100644
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp
@@ -33,6 +33,10 @@
   static address _verify_mxcsr_entry;
   // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers
   static address _key_shuffle_mask_addr;
+
+  //shuffle mask for big-endian 128-bit integers
+  static address _counter_shuffle_mask_addr;
+
   // masks and table for CRC32
   static uint64_t _crc_by128_masks[];
   static juint    _crc_table[];
@@ -43,6 +47,7 @@
  public:
   static address verify_mxcsr_entry()    { return _verify_mxcsr_entry; }
   static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
+  static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; }
   static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
   static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
   static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
index bca5d493c..538f83e69 100644
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp
@@ -31,7 +31,7 @@

 enum platform_dependent_constants {
   code_size1 =  9000,           // simply increase if too small (assembler will crash if too small)
-  code_size2 = 22000            // simply increase if too small (assembler will crash if too small)
+  code_size2 = 25800            // simply increase if too small (assembler will crash if too small)
 };

 class x86 {
diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
index b048fd74e..f963cd2f8 100644
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp
@@ -33,7 +33,7 @@ static bool    returns_to_call_stub(address return_pc)   { return return_pc == _

 enum platform_dependent_constants {
   code_size1 = 19000,          // simply increase if too small (assembler will crash if too small)
-  code_size2 = 24000           // simply increase if too small (assembler will crash if too small)
+  code_size2 = 27000           // simply increase if too small (assembler will crash if too small)
 };

 class x86 {
diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
index 46b3e32ea..ce3037d76 100644
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@@ -573,6 +573,28 @@ void VM_Version::get_processor_features() {
         }
         FLAG_SET_DEFAULT(UseAESIntrinsics, false);
       }
+
+      // --AES-CTR begins-- (AES-CTR intrinsics depend on UseAESIntrinsics and on SSE4.1)
+      if (!UseAESIntrinsics) {
+        if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+          warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled.");
+          FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+        }
+      } else {
+        if(supports_sse4_1() && UseSSE >= 4) {
+          if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+            FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true);
+          }
+        } else {
+           // The AES-CTR intrinsic stubs require AES instruction support (of course)
+           // but also require SSE4.1 or higher for the instructions they use.
+          if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+             warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled.");
+           }
+           FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+        }
+      }
+      // --AES-CTR ends--
     }
   } else if (UseAES || UseAESIntrinsics) {
     if (UseAES && !FLAG_IS_DEFAULT(UseAES)) {
@@ -583,6 +605,10 @@ void VM_Version::get_processor_features() {
       warning("AES intrinsics are not available on this CPU");
       FLAG_SET_DEFAULT(UseAESIntrinsics, false);
     }
+    if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      warning("AES-CTR intrinsics are not available on this CPU");
+      FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+    }
   }

   // Use CLMUL instructions if available.
@@ -606,6 +632,16 @@ void VM_Version::get_processor_features() {
     FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
   }

+  // Enable AES-CTR by default only under the same AES + SSE4.1 gate used earlier; otherwise force it off.
+  if (UseAESIntrinsics && supports_sse4_1() && UseSSE >= 4) {
+    if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) UseAESCTRIntrinsics = true;
+  } else if (UseAESCTRIntrinsics) {
+    if (!FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) {
+      warning("AES/CTR intrinsics are not available on this CPU");
+    }
+    FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
+  }
+
   // GHASH/GCM intrinsics
   if (UseCLMUL && (UseSSE > 2)) {
     if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp
index 942d172a1..4ca2a3ad4 100644
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@@ -846,6 +846,10 @@
    do_name(     decrypt_name,                                      "implDecrypt")                                       \
    do_signature(byteArray_int_int_byteArray_int_signature,         "([BII[BI)I")                                        \
                                                                                                                         \
+  do_class(com_sun_crypto_provider_counterMode,      "com/sun/crypto/provider/CounterMode")                             \
+   do_intrinsic(_counterMode_AESCrypt, com_sun_crypto_provider_counterMode, crypt_name, byteArray_int_int_byteArray_int_signature, F_R)   \
+   do_name(     crypt_name,                                 "implCrypt")                                                    \
+                                                                                                                        \
   /* support for sun.security.provider.SHA */                                                                           \
   do_class(sun_security_provider_sha,                              "sun/security/provider/SHA")                         \
   do_intrinsic(_sha_implCompress, sun_security_provider_sha, implCompress_name, implCompress_signature, F_R)            \
diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp
index 6f8ffe608..a0e497f08 100644
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@@ -952,6 +952,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
                   strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp
index bb721f6f1..2add82dd1 100644
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@@ -196,6 +196,7 @@ class LibraryCallKit : public GraphKit {
     return generate_method_call(method_id, true, false);
   }
   Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static);
+  Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls);

   Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2);
   Node* make_string_method_node(int opcode, Node* str1, Node* str2);
@@ -309,7 +310,9 @@ class LibraryCallKit : public GraphKit {
   bool inline_reference_get();
   bool inline_aescrypt_Block(vmIntrinsics::ID id);
   bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id);
+  bool inline_counterMode_AESCrypt(vmIntrinsics::ID id);
   Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
+  Node* inline_counterMode_AESCrypt_predicate();
   Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
   Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
   bool inline_ghash_processBlocks();
@@ -558,6 +561,13 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
     predicates = 1;
     break;

+  case vmIntrinsics::_counterMode_AESCrypt:
+    if (!UseAESCTRIntrinsics) {
+      return NULL;
+    }
+    predicates = 1;
+    break;
+
   case vmIntrinsics::_sha_implCompress:
     if (!UseSHA1Intrinsics) return NULL;
     break;
@@ -950,6 +960,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     return inline_cipherBlockChaining_AESCrypt(intrinsic_id());

+  case vmIntrinsics::_counterMode_AESCrypt:
+    return inline_counterMode_AESCrypt(intrinsic_id());
+
   case vmIntrinsics::_sha_implCompress:
   case vmIntrinsics::_sha2_implCompress:
   case vmIntrinsics::_sha5_implCompress:
@@ -1021,6 +1034,8 @@ Node* LibraryCallKit::try_to_predicate(int predicate) {
     return inline_cipherBlockChaining_AESCrypt_predicate(false);
   case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
     return inline_cipherBlockChaining_AESCrypt_predicate(true);
+  case vmIntrinsics::_counterMode_AESCrypt:
+    return inline_counterMode_AESCrypt_predicate();
   case vmIntrinsics::_digestBase_implCompressMB:
     return inline_digestBase_implCompressMB_predicate(predicate);

@@ -6581,6 +6596,39 @@ Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * field
   return loadedField;
 }

+Node * LibraryCallKit::field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString,
+                                                 bool is_exact = true, bool is_static = false,
+                                                 ciInstanceKlass * fromKls = NULL) {
+  if (fromKls == NULL) { // no klass supplied: derive it from the compile-time type of fromObj
+    const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr();
+    assert(tinst != NULL, "obj is null");
+    assert(tinst->klass()->is_loaded(), "obj is not loaded");
+    assert(!is_exact || tinst->klass_is_exact(), "klass not exact");
+    fromKls = tinst->klass()->as_instance_klass();
+  }
+  else { // an explicit klass is only accepted for static field lookups
+    assert(is_static, "only for static field access");
+  }
+  ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName),
+    ciSymbol::make(fieldTypeString),
+    is_static);
+
+  assert(field != NULL, "undefined field");
+  assert(!field->is_volatile(), "not defined for volatile fields");
+
+  if (is_static) { // static field: the base object becomes the klass's java mirror
+    const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror());
+    fromObj = makecon(tip);
+  }
+
+  // Next code copied from Parse::do_get_xxx(); unlike load_field_from_object, no load is emitted here.
+
+  // Compute the field's address only: base object plus the field's byte offset.
+  int offset = field->offset_in_bytes();
+  Node *adr = basic_plus_adr(fromObj, fromObj, offset);
+
+  return adr; // address node of the field, not its value
+}

 //------------------------------inline_aescrypt_Block-----------------------
 bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) {
@@ -6747,6 +6795,90 @@ bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) {
   return true;
 }

+//------------------------------inline_counterMode_AESCrypt-----------------------
+bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) {
+  assert(UseAES, "need AES instruction support");
+  if (!UseAESCTRIntrinsics) return false;
+
+  address stubAddr = NULL;
+  const char *stubName = NULL;
+  if (id == vmIntrinsics::_counterMode_AESCrypt) {
+    stubAddr = StubRoutines::counterMode_AESCrypt();
+    stubName = "counterMode_AESCrypt";
+  }
+  if (stubAddr == NULL) return false; // no stub generated for this platform: do not intrinsify
+
+  Node* counterMode_object = argument(0);
+  Node* src = argument(1);
+  Node* src_offset = argument(2);
+  Node* len = argument(3);
+  Node* dest = argument(4);
+  Node* dest_offset = argument(5);
+
+  // (1) src and dest are arrays.
+  const Type* src_type = src->Value(&_gvn);
+  const Type* dest_type = dest->Value(&_gvn);
+  const TypeAryPtr* top_src = src_type->isa_aryptr();
+  const TypeAryPtr* top_dest = dest_type->isa_aryptr();
+  assert(top_src != NULL && top_src->klass() != NULL &&
+         top_dest != NULL && top_dest->klass() != NULL, "args are strange");
+
+  // checks are the responsibility of the caller
+  Node* src_start = src;
+  Node* dest_start = dest;
+  if (src_offset != NULL || dest_offset != NULL) {
+    assert(src_offset != NULL && dest_offset != NULL, "");
+    src_start = array_element_address(src, src_offset, T_BYTE);
+    dest_start = array_element_address(dest, dest_offset, T_BYTE);
+  }
+
+  // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object
+  // (because of the predicated logic executed earlier).
+  // so we cast it here safely.
+  // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java
+  Node* embeddedCipherObj = load_field_from_object(counterMode_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+  if (embeddedCipherObj == NULL) return false;
+  // cast it to what we know it will be at runtime
+  const TypeInstPtr* tinst = _gvn.type(counterMode_object)->isa_instptr();
+  assert(tinst != NULL, "CTR obj is null");
+  assert(tinst->klass()->is_loaded(), "CTR obj is not loaded");
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  assert(klass_AESCrypt->is_loaded(), "predicate checks that this class is loaded");
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt);
+  const TypeOopPtr* xtype = aklass->as_instance_type();
+  Node* aescrypt_object = new (C) CheckCastPPNode(control(), embeddedCipherObj, xtype);
+  aescrypt_object = _gvn.transform(aescrypt_object);
+  // we need to get the start of the aescrypt_object's expanded key array
+  Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object);
+  if (k_start == NULL) return false;
+  // similarly, get the start address of the "counter" byte array
+  Node* obj_counter = load_field_from_object(counterMode_object, "counter", "[B", /*is_exact*/ false);
+  if (obj_counter == NULL) return false;
+  Node* cnt_start = array_element_address(obj_counter, intcon(0), T_BYTE);
+
+  Node* saved_encCounter = load_field_from_object(counterMode_object, "encryptedCounter", "[B", /*is_exact*/ false);
+  if (saved_encCounter == NULL) return false;
+  Node* saved_encCounter_start = array_element_address(saved_encCounter, intcon(0), T_BYTE);
+  Node* used = field_address_from_object(counterMode_object, "used", "I", /*is_exact*/ false); // address of the int field, not its value
+
+  Node* ctrCrypt;
+  if (Matcher::pass_original_key_for_aes()) {
+    // no SPARC version for AES/CTR intrinsics now.
+    return false;
+  }
+  // Call the stub, passing src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start and used
+  ctrCrypt = make_runtime_call(RC_LEAF|RC_NO_FP,
+                               OptoRuntime::counterMode_aescrypt_Type(),
+                               stubAddr, stubName, TypePtr::BOTTOM,
+                               src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start, used);
+
+  // the stub's int result (cipher length) becomes the result of the intrinsic
+  Node* retvalue = _gvn.transform(new (C) ProjNode(ctrCrypt, TypeFunc::Parms));
+  set_result(retvalue);
+  return true;
+}
+
 //------------------------------get_key_start_from_aescrypt_object-----------------------
 Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) {
 #ifdef PPC64
@@ -6841,6 +6973,48 @@ Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypt
   return _gvn.transform(region);
 }

+//----------------------------inline_counterMode_AESCrypt_predicate----------------------------
+// Return node representing slow path of predicate check.
+// the pseudo code we want to emulate with this predicate is:
+//    if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath
+// Unlike the CBC predicate, this one takes no 'decrypting' argument and does
+// not compare cipher against plain: the instanceof test is the only check.
+// If AESCrypt is not loaded at compile time, the fast path is removed and the
+// current control is returned as the slow path.
+//
+
+Node* LibraryCallKit::inline_counterMode_AESCrypt_predicate() {
+  // The receiver was checked for NULL already.
+  Node* objCTR = argument(0);
+
+  // Load embeddedCipher field of the counter-mode (CTR) object.
+  Node* embeddedCipherObj = load_field_from_object(objCTR, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false);
+
+  // get AESCrypt klass for instanceOf check
+  // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point
+  // it is looked up through the klass of the CTR object
+  const TypeInstPtr* tinst = _gvn.type(objCTR)->isa_instptr();
+  assert(tinst != NULL, "CTRobj is null");
+  assert(tinst->klass()->is_loaded(), "CTRobj is not loaded");
+
+  // we want to do an instanceof comparison against the AESCrypt class
+  ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt"));
+  if (!klass_AESCrypt->is_loaded()) {
+    // if AESCrypt is not even loaded, we never take the intrinsic fast path
+    Node* ctrl = control();
+    set_control(top()); // no regular fast path
+    return ctrl;
+  }
+
+  ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass();
+  Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt)));
+  Node* cmp_instof = _gvn.transform(new (C) CmpINode(instof, intcon(1)));
+  Node* bool_instof = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne)); // true when instanceof result != 1
+  Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN);
+
+  return instof_false; // even if it is NULL
+}
+
 //------------------------------inline_ghash_processBlocks
 bool LibraryCallKit::inline_ghash_processBlocks() {
   address stubAddr;
diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp
index 0a86211ba..1c51be19b 100644
--- a/hotspot/src/share/vm/opto/runtime.cpp
+++ b/hotspot/src/share/vm/opto/runtime.cpp
@@ -1021,6 +1021,35 @@ const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() {
   return TypeFunc::make(domain, range);
 }

+// for counterMode calls of aescrypt encrypt/decrypt: six pointers and an int length (plus the original key if required), returning int
+const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() {
+  // create input type (domain)
+  int num_args = 7;
+  if (Matcher::pass_original_key_for_aes()) {
+    num_args = 8; // one extra pointer for the original key array
+  }
+  int argcnt = num_args;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL; // src
+  fields[argp++] = TypePtr::NOTNULL; // dest
+  fields[argp++] = TypePtr::NOTNULL; // k array
+  fields[argp++] = TypePtr::NOTNULL; // counter array
+  fields[argp++] = TypeInt::INT; // src len
+  fields[argp++] = TypePtr::NOTNULL; // saved_encCounter
+  fields[argp++] = TypePtr::NOTNULL; // saved used addr
+  if (Matcher::pass_original_key_for_aes()) {
+    fields[argp++] = TypePtr::NOTNULL; // original k array
+  }
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+  // returning cipher len (int)
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = TypeInt::INT;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields);
+  return TypeFunc::make(domain, range);
+}
+
 /*
  * void implCompress(byte[] buf, int ofs)
  */
diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp
index 47133d58c..f27e7d507 100644
--- a/hotspot/src/share/vm/opto/runtime.hpp
+++ b/hotspot/src/share/vm/opto/runtime.hpp
@@ -299,6 +299,7 @@ private:

   static const TypeFunc* aescrypt_block_Type();
   static const TypeFunc* cipherBlockChaining_aescrypt_Type();
+  static const TypeFunc* counterMode_aescrypt_Type();

   static const TypeFunc* sha_implCompress_Type();
   static const TypeFunc* digestBase_implCompressMB_Type();
diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp
index 65dfcf69b..91e52f033 100644
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@@ -734,6 +734,9 @@ class CommandLineFlags {
   product(bool, UseAESIntrinsics, false,                                    \
           "Use intrinsics for AES versions of crypto")                      \
                                                                             \
+  product(bool, UseAESCTRIntrinsics, false,                                 \
+          "Use intrinsics for the paralleled version of AES/CTR crypto")    \
+                                                                            \
   product(bool, UseSHA1Intrinsics, false,                                   \
           "Use intrinsics for SHA-1 crypto hash function")                  \
                                                                             \
diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp
index f2106d13a..d66237137 100644
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp
@@ -124,6 +124,7 @@ address StubRoutines::_aescrypt_encryptBlock               = NULL;
 address StubRoutines::_aescrypt_decryptBlock               = NULL;
 address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
 address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_counterMode_AESCrypt                = NULL;
 address StubRoutines::_ghash_processBlocks                 = NULL;

 address StubRoutines::_sha1_implCompress     = NULL;
diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp
index 16075d9f4..9fb589540 100644
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp
@@ -202,6 +202,7 @@ class StubRoutines: AllStatic {
   static address _aescrypt_decryptBlock;
   static address _cipherBlockChaining_encryptAESCrypt;
   static address _cipherBlockChaining_decryptAESCrypt;
+  static address _counterMode_AESCrypt;
   static address _ghash_processBlocks;

   static address _sha1_implCompress;
@@ -370,6 +371,7 @@ class StubRoutines: AllStatic {
   static address aescrypt_decryptBlock()                { return _aescrypt_decryptBlock; }
   static address cipherBlockChaining_encryptAESCrypt()  { return _cipherBlockChaining_encryptAESCrypt; }
   static address cipherBlockChaining_decryptAESCrypt()  { return _cipherBlockChaining_decryptAESCrypt; }
+  static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
   static address ghash_processBlocks() { return _ghash_processBlocks; }

   static address sha1_implCompress()     { return _sha1_implCompress; }
diff --git a/hotspot/src/share/vm/runtime/vmStructs.cpp b/hotspot/src/share/vm/runtime/vmStructs.cpp
index 3f2bfeb74..842b5840d 100644
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp
@@ -815,6 +815,7 @@ typedef TwoOopHashtable<Symbol*, mtClass>     SymbolTwoOopHashtable;
      static_field(StubRoutines,                _aescrypt_decryptBlock,                        address)                               \
      static_field(StubRoutines,                _cipherBlockChaining_encryptAESCrypt,          address)                               \
      static_field(StubRoutines,                _cipherBlockChaining_decryptAESCrypt,          address)                               \
+     static_field(StubRoutines,                _counterMode_AESCrypt,                         address)                               \
      static_field(StubRoutines,                _ghash_processBlocks,                          address)                               \
      static_field(StubRoutines,                _updateBytesCRC32,                             address)                               \
      static_field(StubRoutines,                _crc_table_adr,                                address)                               \
diff --git a/hotspot/test/compiler/7184394/TestAESBase.java b/hotspot/test/compiler/7184394/TestAESBase.java
index 5c3e6881e..afda2a1f7 100644
--- a/hotspot/test/compiler/7184394/TestAESBase.java
+++ b/hotspot/test/compiler/7184394/TestAESBase.java
@@ -106,8 +106,8 @@ abstract public class TestAESBase {
       cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");
       dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE");

-      // CBC init
-      if (mode.equals("CBC")) {
+      // CBC or CTR init
+      if (mode.equals("CBC") || mode.equals("CTR")) {
         IvParameterSpec initVector = new IvParameterSpec(iv);
         cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
         algParams = cipher.getParameters();
diff --git a/hotspot/test/compiler/7184394/TestAESMain.java b/hotspot/test/compiler/7184394/TestAESMain.java
index ddd8eeaef..65949420a 100644
--- a/hotspot/test/compiler/7184394/TestAESMain.java
+++ b/hotspot/test/compiler/7184394/TestAESMain.java
@@ -48,6 +48,13 @@
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
  * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
  *
  * @author Tom Deneau
  */
diff --git a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java
index aea9336c9..c2bd38a71 100644
--- a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java
+++ b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java
@@ -39,10 +39,10 @@ import java.security.InvalidKeyException;
  * @author Andreas Sterbenz
  * @since 1.4.2
  */
-final class CounterMode extends FeedbackCipher {
+class CounterMode extends FeedbackCipher {

     // current counter value
-    private final byte[] counter;
+    final byte[] counter;

     // encrypted bytes of the previous counter value
     private final byte[] encryptedCounter;
@@ -137,7 +137,7 @@ final class CounterMode extends FeedbackCipher {
      * <code>cipherOffset</code>.
      *
      * @param in the buffer with the input data to be encrypted
-     * @param inOffset the offset in <code>plain</code>
+     * @param inOff the offset in <code>plain</code>
      * @param len the length of the input data
      * @param out the buffer for the result
      * @param outOff the offset in <code>cipher</code>
@@ -176,6 +176,11 @@ final class CounterMode extends FeedbackCipher {
         RangeUtil.nullAndBoundsCheck(in, inOff, len);
         RangeUtil.nullAndBoundsCheck(out, outOff, len);

+        return implCrypt(in, inOff, len, out, outOff);
+    }
+
+    // Implementation of crypt() method. Possibly replaced with a compiler intrinsic.
+    private int implCrypt(byte[] in, int inOff, int len, byte[] out, int outOff) {
         int result = len;
         while (len-- > 0) {
             if (used >= blockSize) {
diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java
index f8a3eaa0a..6a394e448 100644
--- a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java
+++ b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -29,52 +29,43 @@

 package com.sun.crypto.provider;

-import java.security.*;
-import javax.crypto.*;
+import javax.crypto.IllegalBlockSizeException;
 import static com.sun.crypto.provider.AESConstants.AES_BLOCK_SIZE;

 /**
  * This class represents the GCTR function defined in NIST 800-38D
- * under section 6.5. It needs to be constructed w/ an initialized
- * cipher object, and initial counter block(ICB). Given an input X
- * of arbitrary length, it processes and returns an output which has
- * the same length as X. The invariants of this class are:
- *
- * (1) The length of intialCounterBlk (and also of its clones, e.g.,
- * fields counter and counterSave) is equal to AES_BLOCK_SIZE.
- *
- * (2) After construction, the field counter never becomes null, it
- * always contains a byte array of length AES_BLOCK_SIZE.
+ * under section 6.5.  With a given cipher object and initial counter
+ * block, a counter mode operation is performed.  Blocksize is limited
+ * to 16 bytes.
  *
  * If any invariant is broken, failures can occur because the
  * AESCrypt.encryptBlock method can be intrinsified on the HotSpot VM
  * (see JDK-8067648 for details).
  *
+ * The counter mode operations can be intrinsified and parallelized
+ * by using CounterMode.implCrypt() if HotSpot VM supports it on the
+ * architecture.
+ *
  * <p>This function is used in the implementation of GCM mode.
  *
  * @since 1.8
  */
-final class GCTR {
-
-    // these fields should not change after the object has been constructed
-    private final SymmetricCipher aes;
-    private final byte[] icb;
-
-    // the current counter value
-    private byte[] counter;
+final class GCTR extends CounterMode {

-    // needed for save/restore calls
-    private byte[] counterSave = null;
-
-    // NOTE: cipher should already be initialized
     GCTR(SymmetricCipher cipher, byte[] initialCounterBlk) {
-        this.aes = cipher;
+        super(cipher);
         if (initialCounterBlk.length != AES_BLOCK_SIZE) {
             throw new RuntimeException("length of initial counter block (" + initialCounterBlk.length +
                                        ") not equal to AES_BLOCK_SIZE (" + AES_BLOCK_SIZE + ")");
         }
-        this.icb = initialCounterBlk;
-        this.counter = icb.clone();
+
+        iv = initialCounterBlk;
+        reset();
+    }
+
+    @Override
+    String getFeedback() {
+        return "GCTR";
     }

     // input must be multiples of 128-bit blocks when calling update
@@ -89,23 +80,11 @@ final class GCTR {
             throw new RuntimeException("output buffer too small");
         }

-        byte[] encryptedCntr = new byte[AES_BLOCK_SIZE];
-
-        int numOfCompleteBlocks = inLen / AES_BLOCK_SIZE;
-        for (int i = 0; i < numOfCompleteBlocks; i++) {
-            aes.encryptBlock(counter, 0, encryptedCntr, 0);
-            for (int n = 0; n < AES_BLOCK_SIZE; n++) {
-                int index = (i * AES_BLOCK_SIZE + n);
-                out[outOfs + index] =
-                    (byte) ((in[inOfs + index] ^ encryptedCntr[n]));
-            }
-            GaloisCounterMode.increment32(counter);
-        }
-        return inLen;
+        return encrypt(in, inOfs, inLen, out, outOfs);
     }

     // input can be arbitrary size when calling doFinal
-    protected int doFinal(byte[] in, int inOfs, int inLen, byte[] out,
+    int doFinal(byte[] in, int inOfs, int inLen, byte[] out,
                           int outOfs) throws IllegalBlockSizeException {
         try {
             if (inLen < 0) {
@@ -118,7 +97,7 @@ final class GCTR {
                 if (lastBlockSize != 0) {
                     // do the last partial block
                     byte[] encryptedCntr = new byte[AES_BLOCK_SIZE];
-                    aes.encryptBlock(counter, 0, encryptedCntr, 0);
+                    embeddedCipher.encryptBlock(counter, 0, encryptedCntr, 0);
                     for (int n = 0; n < lastBlockSize; n++) {
                         out[outOfs + completeBlkLen + n] =
                             (byte) ((in[inOfs + completeBlkLen + n] ^
@@ -131,28 +110,4 @@ final class GCTR {
         }
         return inLen;
     }
-
-    /**
-     * Resets the content of this object to when it's first constructed.
-     */
-    void reset() {
-        System.arraycopy(icb, 0, counter, 0, icb.length);
-        counterSave = null;
-    }
-
-    /**
-     * Save the current content of this object.
-     */
-    void save() {
-        this.counterSave = this.counter.clone();
-    }
-
-    /**
-     * Restores the content of this object to the previous saved one.
-     */
-    void restore() {
-        if (this.counterSave != null) {
-            this.counter = this.counterSave;
-        }
-    }
 }
diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java
index dc42e6bbf..78f0723d7 100644
--- a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java
+++ b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java
@@ -122,10 +122,10 @@ final class GHASH {

     }

-    /* subkeyH and state are stored in long[] for GHASH intrinsic use */
+    /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */

-    // hash subkey H; should not change after the object has been constructed
-    private final long[] subkeyH;
+    // hashtable subkeyHtbl; holds 2*9 powers of subkeyH computed using carry-less multiplication
+    private long[] subkeyHtbl;

     // buffer for storing hash
     private final long[] state;
@@ -147,9 +147,9 @@ final class GHASH {
             throw new ProviderException("Internal error");
         }
         state = new long[2];
-        this.subkeyH = new long[2];
-        this.subkeyH[0] = getLong(subkeyH, 0);
-        this.subkeyH[1] = getLong(subkeyH, 8);
+        subkeyHtbl = new long[2*9];
+        subkeyHtbl[0] = getLong(subkeyH, 0);
+        subkeyHtbl[1] = getLong(subkeyH, 8);
     }

     /**
@@ -192,8 +192,8 @@ final class GHASH {
         if (inLen == 0) {
             return;
         }
-        ghashRangeCheck(in, inOfs, inLen, state, subkeyH);
-        processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH);
+        ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl);
+        processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl);
     }

     private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) {
@@ -217,8 +217,8 @@ final class GHASH {
             throw new RuntimeException("internal state has invalid length: " +
                                        st.length);
         }
-        if (subH.length != 2) {
-            throw new RuntimeException("internal subkeyH has invalid length: " +
+        if (subH.length != 18) {
+            throw new RuntimeException("internal subkeyHtbl has invalid length: " +
                                        subH.length);
         }
     }
diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java
index ab93e3097..dd2618455 100644
--- a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java
+++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java
@@ -439,6 +439,8 @@ public final class SSLSocketImpl
                 if (!conContext.isNegotiated) {
                     readHandshakeRecord();
                 }
+            } catch (InterruptedIOException iioe) {
+                handleException(iioe);
             } catch (IOException ioe) {
                 throw conContext.fatal(Alert.HANDSHAKE_FAILURE,
                     "Couldn't kickstart handshaking", ioe);
@@ -1309,12 +1311,11 @@ public final class SSLSocketImpl
                 }
             } catch (SSLException ssle) {
                 throw ssle;
+            } catch (InterruptedIOException iioe) {
+                // don't change exception in case of timeouts or interrupts
+                throw iioe;
             } catch (IOException ioe) {
-                if (!(ioe instanceof SSLException)) {
-                    throw new SSLException("readHandshakeRecord", ioe);
-                } else {
-                    throw ioe;
-                }
+                throw new SSLException("readHandshakeRecord", ioe);
             }
         }

@@ -1375,6 +1376,9 @@ public final class SSLSocketImpl
                 }
             } catch (SSLException ssle) {
                 throw ssle;
+            } catch (InterruptedIOException iioe) {
+                // don't change exception in case of timeouts or interrupts
+                throw iioe;
             } catch (IOException ioe) {
                 if (!(ioe instanceof SSLException)) {
                     throw new SSLException("readApplicationRecord", ioe);
diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java
index 401822759..ab5712acc 100644
--- a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java
+++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java
@@ -26,6 +26,7 @@
 package sun.security.ssl;

 import java.io.EOFException;
+import java.io.InterruptedIOException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
@@ -47,37 +48,31 @@ import sun.security.ssl.SSLCipher.SSLReadCipher;
 final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
     private InputStream is = null;
     private OutputStream os = null;
-    private final byte[] temporary = new byte[1024];
+    private final byte[] header = new byte[headerSize];
+    private int headerOff = 0;
+    // Cache for incomplete record body.
+    private ByteBuffer recordBody = ByteBuffer.allocate(1024);

     private boolean formatVerified = false;     // SSLv2 ruled out?

     // Cache for incomplete handshake messages.
     private ByteBuffer handshakeBuffer = null;

-    private boolean hasHeader = false;          // Had read the record header
-
     SSLSocketInputRecord(HandshakeHash handshakeHash) {
         super(handshakeHash, SSLReadCipher.nullTlsReadCipher());
     }

     @Override
     int bytesInCompletePacket() throws IOException {
-        if (!hasHeader) {
-            // read exactly one record
-            try {
-                int really = read(is, temporary, 0, headerSize);
-                if (really < 0) {
-                    // EOF: peer shut down incorrectly
-                    return -1;
-                }
-            } catch (EOFException eofe) {
-                // The caller will handle EOF.
-                return -1;
-            }
-            hasHeader = true;
+        // read header
+        try {
+            readHeader();
+        } catch (EOFException eofe) {
+            // The caller will handle EOF.
+            return -1;
         }

-        byte byteZero = temporary[0];
+        byte byteZero = header[0];
         int len = 0;

         /*
@@ -93,9 +88,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
              * Last sanity check that it's not a wild record
              */
             if (!ProtocolVersion.isNegotiable(
-                    temporary[1], temporary[2], false)) {
+                    header[1], header[2], false)) {
                 throw new SSLException("Unrecognized record version " +
-                        ProtocolVersion.nameOf(temporary[1], temporary[2]) +
+                        ProtocolVersion.nameOf(header[1], header[2]) +
                         " , plaintext connection?");
             }

@@ -109,8 +104,8 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
             /*
              * One of the SSLv3/TLS message types.
              */
-            len = ((temporary[3] & 0xFF) << 8) +
-                   (temporary[4] & 0xFF) + headerSize;
+            len = ((header[3] & 0xFF) << 8) +
+                    (header[4] & 0xFF) + headerSize;
         } else {
             /*
              * Must be SSLv2 or something unknown.
@@ -121,11 +116,11 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
              */
             boolean isShort = ((byteZero & 0x80) != 0);

-            if (isShort && ((temporary[2] == 1) || (temporary[2] == 4))) {
+            if (isShort && ((header[2] == 1) || (header[2] == 4))) {
                 if (!ProtocolVersion.isNegotiable(
-                        temporary[3], temporary[4], false)) {
+                        header[3], header[4], false)) {
                     throw new SSLException("Unrecognized record version " +
-                            ProtocolVersion.nameOf(temporary[3], temporary[4]) +
+                            ProtocolVersion.nameOf(header[3], header[4]) +
                             " , plaintext connection?");
                 }

@@ -138,9 +133,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
                 //
                 // int mask = (isShort ? 0x7F : 0x3F);
                 // len = ((byteZero & mask) << 8) +
-                //        (temporary[1] & 0xFF) + (isShort ? 2 : 3);
+                //        (header[1] & 0xFF) + (isShort ? 2 : 3);
                 //
-                len = ((byteZero & 0x7F) << 8) + (temporary[1] & 0xFF) + 2;
+                len = ((byteZero & 0x7F) << 8) + (header[1] & 0xFF) + 2;
             } else {
                 // Gobblygook!
                 throw new SSLException(
@@ -160,34 +155,41 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
             return null;
         }

-        if (!hasHeader) {
-            // read exactly one record
-            int really = read(is, temporary, 0, headerSize);
-            if (really < 0) {
-                throw new EOFException("SSL peer shut down incorrectly");
-            }
-            hasHeader = true;
-        }
+        // read header
+        readHeader();

-        Plaintext plaintext = null;
-        if (!formatVerified) {
-            formatVerified = true;
+        Plaintext[] plaintext = null;
+        boolean cleanInBuffer = true;
+        try {
+            if (!formatVerified) {
+                formatVerified = true;

-            /*
-             * The first record must either be a handshake record or an
-             * alert message. If it's not, it is either invalid or an
-             * SSLv2 message.
-             */
-            if ((temporary[0] != ContentType.HANDSHAKE.id) &&
-                (temporary[0] != ContentType.ALERT.id)) {
-                hasHeader = false;
-                return handleUnknownRecord(temporary);
+                /*
+                 * The first record must either be a handshake record or an
+                 * alert message. If it's not, it is either invalid or an
+                 * SSLv2 message.
+                 */
+                if ((header[0] != ContentType.HANDSHAKE.id) &&
+                        (header[0] != ContentType.ALERT.id)) {
+                    plaintext = handleUnknownRecord();
+                }
             }
-        }

-        // The record header should has consumed.
-        hasHeader = false;
-        return decodeInputRecord(temporary);
+            // The record header should have been consumed.
+            if (plaintext == null) {
+                plaintext = decodeInputRecord();
+            }
+        } catch(InterruptedIOException e) {
+            // do not clean header and recordBody in case of Socket Timeout
+            cleanInBuffer = false;
+            throw e;
+        } finally {
+            if (cleanInBuffer) {
+                headerOff = 0;
+                recordBody.clear();
+            }
+        }
+        return plaintext;
     }

     @Override
@@ -200,9 +202,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
         this.os = outputStream;
     }

-    // Note that destination may be null
-    private Plaintext[] decodeInputRecord(
-            byte[] header) throws IOException, BadPaddingException {
+    private Plaintext[] decodeInputRecord() throws IOException, BadPaddingException {
         byte contentType = header[0];                   // pos: 0
         byte majorVersion = header[1];                  // pos: 1
         byte minorVersion = header[2];                  // pos: 2
@@ -227,30 +227,27 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
         }

         //
-        // Read a complete record.
+        // Read a complete record and store it in recordBody.
+        // recordBody caches the incoming record so that reading can resume
+        // after a read operation times out.
         //
-        ByteBuffer destination = ByteBuffer.allocate(headerSize + contentLen);
-        int dstPos = destination.position();
-        destination.put(temporary, 0, headerSize);
-        while (contentLen > 0) {
-            int howmuch = Math.min(temporary.length, contentLen);
-            int really = read(is, temporary, 0, howmuch);
-            if (really < 0) {
-                throw new EOFException("SSL peer shut down incorrectly");
+        if (recordBody.position() == 0) {
+            if (recordBody.capacity() < contentLen) {
+                recordBody = ByteBuffer.allocate(contentLen);
             }
-
-            destination.put(temporary, 0, howmuch);
-            contentLen -= howmuch;
+            recordBody.limit(contentLen);
+        } else {
+            contentLen = recordBody.remaining();
         }
-        destination.flip();
-        destination.position(dstPos + headerSize);
+        readFully(contentLen);
+        recordBody.flip();

         if (SSLLogger.isOn && SSLLogger.isOn("record")) {
             SSLLogger.fine(
                     "READ: " +
                     ProtocolVersion.nameOf(majorVersion, minorVersion) +
                     " " + ContentType.nameOf(contentType) + ", length = " +
-                    destination.remaining());
+                    recordBody.remaining());
         }

         //
@@ -259,7 +256,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
         ByteBuffer fragment;
         try {
             Plaintext plaintext =
-                    readCipher.decrypt(contentType, destination, null);
+                    readCipher.decrypt(contentType, recordBody, null);
             fragment = plaintext.fragment;
             contentType = plaintext.contentType;
         } catch (BadPaddingException bpe) {
@@ -368,8 +365,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
             };
     }

-    private Plaintext[] handleUnknownRecord(
-            byte[] header) throws IOException, BadPaddingException {
+    private Plaintext[] handleUnknownRecord() throws IOException, BadPaddingException {
         byte firstByte = header[0];
         byte thirdByte = header[2];

@@ -411,32 +407,29 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
             }

             int msgLen = ((header[0] & 0x7F) << 8) | (header[1] & 0xFF);
-
-            ByteBuffer destination = ByteBuffer.allocate(headerSize + msgLen);
-            destination.put(temporary, 0, headerSize);
-            msgLen -= 3;            // had read 3 bytes of content as header
-            while (msgLen > 0) {
-                int howmuch = Math.min(temporary.length, msgLen);
-                int really = read(is, temporary, 0, howmuch);
-                if (really < 0) {
-                    throw new EOFException("SSL peer shut down incorrectly");
+            if (recordBody.position() == 0) {
+                if (recordBody.capacity() < (headerSize + msgLen)) {
+                    recordBody = ByteBuffer.allocate(headerSize + msgLen);
                 }
-
-                destination.put(temporary, 0, howmuch);
-                msgLen -= howmuch;
+                recordBody.limit(headerSize + msgLen);
+                recordBody.put(header, 0, headerSize);
+            } else {
+                msgLen = recordBody.remaining();
             }
-            destination.flip();
+            msgLen -= 3;            // had read 3 bytes of content as header
+            readFully(msgLen);
+            recordBody.flip();

             /*
              * If we can map this into a V3 ClientHello, read and
              * hash the rest of the V2 handshake, turn it into a
              * V3 ClientHello message, and pass it up.
              */
-            destination.position(2);     // exclude the header
-            handshakeHash.receive(destination);
-            destination.position(0);
+            recordBody.position(2);     // exclude the header
+            handshakeHash.receive(recordBody);
+            recordBody.position(0);

-            ByteBuffer converted = convertToClientHello(destination);
+            ByteBuffer converted = convertToClientHello(recordBody);

             if (SSLLogger.isOn && SSLLogger.isOn("packet")) {
                 SSLLogger.fine(
@@ -456,28 +449,42 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord {
         }
     }

-    // Read the exact bytes of data, otherwise, return -1.
-    private static int read(InputStream is,
-            byte[] buffer, int offset, int len) throws IOException {
-        int n = 0;
-        while (n < len) {
-            int readLen = is.read(buffer, offset + n, len - n);
-            if (readLen < 0) {
-                if (SSLLogger.isOn && SSLLogger.isOn("packet")) {
-                    SSLLogger.fine("Raw read: EOF");
-                }
-                return -1;
+    // Read the exact bytes of data, otherwise, throw IOException.
+    private int readFully(int len) throws IOException {
+        int end = len + recordBody.position();
+        int off = recordBody.position();
+        try {
+            while (off < end) {
+                off += read(is, recordBody.array(), off, end - off);
             }
+        } finally {
+            recordBody.position(off);
+        }
+        return len;
+    }
+
+    // Read SSL record header, otherwise, throw IOException.
+    private int readHeader() throws IOException {
+        while (headerOff < headerSize) {
+            headerOff += read(is, header, headerOff, headerSize - headerOff);
+        }
+        return headerSize;
+    }

+    private static int read(InputStream is, byte[] buf, int off, int len)  throws IOException {
+        int readLen = is.read(buf, off, len);
+        if (readLen < 0) {
             if (SSLLogger.isOn && SSLLogger.isOn("packet")) {
-                ByteBuffer bb = ByteBuffer.wrap(buffer, offset + n, readLen);
-                SSLLogger.fine("Raw read", bb);
+                SSLLogger.fine("Raw read: EOF");
             }
-
-            n += readLen;
+            throw new EOFException("SSL peer shut down incorrectly");
         }

-        return n;
+        if (SSLLogger.isOn && SSLLogger.isOn("packet")) {
+            ByteBuffer bb = ByteBuffer.wrap(buf, off, readLen);
+            SSLLogger.fine("Raw read", bb);
+        }
+        return readLen;
     }

     // Try to use up the input stream without impact the performance too much.
diff --git a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java
index b3d03b370..78e13ea2c 100644
--- a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java
+++ b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java
@@ -27,6 +27,7 @@ package sun.security.ssl;

 import java.io.EOFException;
 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.nio.ByteBuffer;
 import javax.crypto.AEADBadTagException;
 import javax.crypto.BadPaddingException;
@@ -134,6 +135,9 @@ interface SSLTransport {
         } catch (EOFException eofe) {
             // rethrow EOFException, the call will handle it if neede.
             throw eofe;
+        } catch (InterruptedIOException iioe) {
+            // don't close the Socket in case of timeouts or interrupts.
+            throw iioe;
         } catch (IOException ioe) {
             throw context.fatal(Alert.UNEXPECTED_MESSAGE, ioe);
         }
diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java
new file mode 100644
index 000000000..258672f59
--- /dev/null
+++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.javax.crypto.full;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Setup;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.GCMParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+
+/**
+ * This performance test runs AES/GCM encryption and decryption using byte[]
+ * as input and output buffers for single and multi-part testing.
+ *
+ * This test rotates the IV and creates a new GCMParameterSpec for each encrypt
+ * benchmark operation
+ */
+
+public class AESGCMBench extends CryptoBase {
+
+    @Param({"128"})
+    private int keyLength;
+
+    @Param({"1024", "1500", "4096", "16384"})
+    private int dataSize;
+
+    byte[] encryptedData;
+    byte[] in, out;
+    private Cipher encryptCipher;
+    private Cipher decryptCipher;
+    SecretKeySpec ks;
+    GCMParameterSpec gcm_spec;
+    byte[] iv;
+
+    private static final int IV_BUFFER_SIZE = 32;
+    private static final int IV_MODULO = IV_BUFFER_SIZE - 16;
+    int iv_index = 0;
+    int updateLen = 0;
+
+    private int next_iv_index() {
+        int r = iv_index;
+        iv_index = (iv_index + 1) % IV_MODULO;
+        return r;
+    }
+
+    @Setup
+    public void setup() throws Exception {
+        setupProvider();
+
+        // Setup key material
+        byte[] keystring = fillSecureRandom(new byte[keyLength / 8]);
+        ks = new SecretKeySpec(keystring, "AES");
+        iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]);
+        gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16);
+
+        // Setup Cipher classes
+        encryptCipher = makeCipher(prov, "AES/GCM/NoPadding");
+        encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec);
+        decryptCipher = makeCipher(prov, "AES/GCM/NoPadding");
+        decryptCipher.init(Cipher.DECRYPT_MODE, ks,
+                encryptCipher.getParameters().
+                        getParameterSpec(GCMParameterSpec.class));
+
+        // Setup input/output buffers
+        in = fillRandom(new byte[dataSize]);
+        encryptedData = new byte[encryptCipher.getOutputSize(in.length)];
+        out = new byte[encryptedData.length];
+        encryptCipher.doFinal(in, 0, in.length, encryptedData, 0);
+        updateLen = in.length / 2;
+
+    }
+
+    @Benchmark
+    public void encrypt() throws Exception {
+        gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16);
+        encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec);
+        encryptCipher.doFinal(in, 0, in.length, out, 0);
+    }
+
+    @Benchmark
+    public void encryptMultiPart() throws Exception {
+        gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16);
+        encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec);
+        int outOfs = encryptCipher.update(in, 0, updateLen, out, 0);
+        encryptCipher.doFinal(in, updateLen, in.length - updateLen,
+                out, outOfs);
+    }
+
+    @Benchmark
+    public void decrypt() throws Exception {
+        decryptCipher.init(Cipher.DECRYPT_MODE, ks,
+                encryptCipher.getParameters().
+                        getParameterSpec(GCMParameterSpec.class));
+        decryptCipher.doFinal(encryptedData, 0, encryptedData.length, out, 0);
+    }
+
+    @Benchmark
+    public void decryptMultiPart() throws Exception {
+        decryptCipher.init(Cipher.DECRYPT_MODE, ks,
+                encryptCipher.getParameters().
+                        getParameterSpec(GCMParameterSpec.class));
+        decryptCipher.update(encryptedData, 0, updateLen, out, 0);
+        decryptCipher.doFinal(encryptedData, updateLen,
+                encryptedData.length - updateLen, out, 0);
+    }
+}
\ No newline at end of file
diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java
new file mode 100644
index 000000000..cb6d20c51
--- /dev/null
+++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.javax.crypto.full;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Setup;
+
+import javax.crypto.Cipher;
+import javax.crypto.spec.GCMParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+import java.nio.ByteBuffer;
+
+/**
+ * This performance test runs AES/GCM encryption and decryption using heap and
+ * direct ByteBuffers as input and output buffers for single and multi-part
+ * operations.
+ *
+ * This test rotates the IV and creates a new GCMParameterSpec for each encrypt
+ * benchmark operation
+ */
+
+public class AESGCMByteBuffer extends CryptoBase {
+
+    @Param({"128"})
+    private int keyLength;
+
+    @Param({"1024", "1500", "4096", "16384"})
+    private int dataSize;
+
+    @Param({"direct", "heap"})
+    private String dataMethod;
+
+    byte[] data;
+    ByteBuffer encryptedData;
+    ByteBuffer in, out;
+    private Cipher encryptCipher;
+    private Cipher decryptCipher;
+    SecretKeySpec ks;
+    GCMParameterSpec gcm_spec;
+    byte[] iv;
+
+    private static final int IV_BUFFER_SIZE = 32;
+    private static final int IV_MODULO = IV_BUFFER_SIZE - 16;
+    int iv_index = 0;
+    int updateLen = 0;
+
+    private int next_iv_index() {
+        int r = iv_index;
+        iv_index = (iv_index + 1) % IV_MODULO;
+        return r;
+    }
+
+    @Setup
+    public void setup() throws Exception {
+        setupProvider();
+
+        // Setup key material
+        byte[] keystring = fillSecureRandom(new byte[keyLength / 8]);
+        ks = new SecretKeySpec(keystring, "AES");
+        iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]);
+        gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16);
+
+        // Setup Cipher classes
+        encryptCipher = makeCipher(prov, "AES/GCM/NoPadding");
+        encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec);
+        decryptCipher = makeCipher(prov, "AES/GCM/NoPadding");
+        decryptCipher.init(Cipher.DECRYPT_MODE, ks,
+                encryptCipher.getParameters().
+                        getParameterSpec(GCMParameterSpec.class));
+
+        // Setup input/output buffers
+        data = fillRandom(new byte[dataSize]);
+        if (dataMethod.equalsIgnoreCase("direct")) {
+            in = ByteBuffer.allocateDirect(data.length);
+            in.put(data);
+            in.flip();
+            encryptedData = ByteBuffer.allocateDirect(
+                    encryptCipher.getOutputSize(data.length));
+            out = ByteBuffer.allocateDirect(encryptedData.capacity());
+        } else if (dataMethod.equalsIgnoreCase("heap")) {
+            in = ByteBuffer.wrap(data);
+            encryptedData = ByteBuffer.allocate(
+                    encryptCipher.getOutputSize(data.length));
+            out = ByteBuffer.allocate(encryptedData.capacity());
+        }
+
+        encryptCipher.doFinal(in, encryptedData);
+        encryptedData.flip();
+        in.flip();
+        updateLen = in.remaining() / 2;
+    }
+
+    @Benchmark
+    public void encrypt() throws Exception {
+        gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16);
+        encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec);
+        encryptCipher.doFinal(in, out);
+        out.flip();
+        in.flip();
+    }
+
+    @Benchmark
+    public void encryptMultiPart() throws Exception {
+        gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16);
+        encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec);
+        in.limit(updateLen);
+        encryptCipher.update(in, out);
+        in.limit(in.capacity());
+        encryptCipher.doFinal(in, out);
+        out.flip();
+        in.flip();
+    }
+
+    @Benchmark
+    public void decrypt() throws Exception {
+        decryptCipher.init(Cipher.DECRYPT_MODE, ks,
+                encryptCipher.getParameters().
+                        getParameterSpec(GCMParameterSpec.class));
+        decryptCipher.doFinal(encryptedData, out);
+        encryptedData.flip();
+        out.flip();
+    }
+
+    @Benchmark
+    public void decryptMultiPart() throws Exception {
+        decryptCipher.init(Cipher.DECRYPT_MODE, ks,
+                encryptCipher.getParameters().
+                        getParameterSpec(GCMParameterSpec.class));
+
+        int len = encryptedData.remaining();
+        encryptedData.limit(updateLen);
+        decryptCipher.update(encryptedData, out);
+        encryptedData.limit(len);
+
+        decryptCipher.doFinal(encryptedData, out);
+        encryptedData.flip();
+        out.flip();
+    }
+
+}
\ No newline at end of file
diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java
new file mode 100644
index 000000000..4af12703b
--- /dev/null
+++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.javax.crypto.full;
+
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import javax.crypto.BadPaddingException;
+import javax.crypto.Cipher;
+import javax.crypto.IllegalBlockSizeException;
+import javax.crypto.NoSuchPaddingException;
+import java.security.NoSuchAlgorithmException;
+import java.security.Provider;
+import java.security.SecureRandom;
+import java.security.Security;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+
+@Fork(jvmArgsAppend = {"-XX:+AlwaysPreTouch"}, value = 5)
+@Warmup(iterations = 3, time = 3)
+@Measurement(iterations = 8, time = 2)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@State(Scope.Thread)
+@BenchmarkMode(Mode.Throughput)
+public class CryptoBase {
+
+    @Param({""})
+    private String provider;
+
+    public Provider prov = null;
+
+    @Setup
+    public void setupProvider() {
+        if (provider != null && !provider.isEmpty()) {
+            prov = Security.getProvider(provider);
+            if (prov == null) {
+                throw new RuntimeException("Can't find prodiver \"" + provider + "\"");
+            }
+        }
+    }
+
+    public static Cipher makeCipher(Provider prov, String algorithm) throws NoSuchPaddingException, NoSuchAlgorithmException {
+        return (prov == null) ? Cipher.getInstance(algorithm) : Cipher.getInstance(algorithm, prov);
+    }
+
+    public static byte[][] fillRandom(byte[][] data) {
+        Random rnd = new Random();
+        for (byte[] d : data) {
+            rnd.nextBytes(d);
+        }
+        return data;
+    }
+
+    public static byte[] fillRandom(byte[] data) {
+        Random rnd = new Random();
+        rnd.nextBytes(data);
+        return data;
+    }
+
+    public static byte[] fillSecureRandom(byte[] data) {
+        SecureRandom rnd = new SecureRandom();
+        rnd.nextBytes(data);
+        return data;
+    }
+
+    public static byte[][] fillEncrypted(byte[][] data, Cipher encryptCipher) throws BadPaddingException, IllegalBlockSizeException {
+        byte[][] encryptedData = new byte[data.length][];
+        for (int i = 0; i < encryptedData.length; i++) {
+            encryptedData[i] = encryptCipher.doFinal(data[i]);
+        }
+        return encryptedData;
+    }
+}
\ No newline at end of file
diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java
new file mode 100644
index 000000000..a21b0c87f
--- /dev/null
+++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.javax.crypto.small;
+
+import org.openjdk.jmh.annotations.Param;
+
+public class AESGCMBench extends
+        org.openjdk.bench.javax.crypto.full.AESGCMBench {
+
+    @Param({"128"})
+    private int keyLength;
+
+    @Param({"1024"})
+    private int dataSize;
+
+}
\ No newline at end of file
diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java
new file mode 100644
index 000000000..2e389d300
--- /dev/null
+++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.javax.crypto.small;
+
+import org.openjdk.jmh.annotations.Param;
+
+public class AESGCMByteBuffer extends
+        org.openjdk.bench.javax.crypto.full.AESGCMByteBuffer {
+
+    @Param({"128"})
+    private int keyLength;
+
+    @Param({"1024"})
+    private int dataSize;
+
+}
\ No newline at end of file
diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java
index 3eb1d7b89..7678cc71f 100644
--- a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java
+++ b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java
@@ -26,8 +26,7 @@

 /*
  * @test
- * @bug 4836493
- * @ignore need further evaluation
+ * @bug 4836493 8239798
  * @summary Socket timeouts for SSLSockets causes data corruption.
  * @run main/othervm ClientTimeout
  */
diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java
index 3e626a257..5578ea725 100644
--- a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java
+++ b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java
@@ -36,7 +36,7 @@

 import javax.net.ssl.*;
 import java.io.*;
-import java.net.InetAddress;
+import java.net.*;

 public class SSLExceptionForIOIssue implements SSLContextTemplate {

@@ -139,7 +139,7 @@ public class SSLExceptionForIOIssue implements SSLContextTemplate {
             } catch (SSLProtocolException | SSLHandshakeException sslhe) {
                 clientException = sslhe;
                 System.err.println("unexpected client exception: " + sslhe);
-            } catch (SSLException ssle) {
+            } catch (SSLException | SocketTimeoutException ssle) {
                 // the expected exception, ignore it
                 System.err.println("expected client exception: " + ssle);
             } catch (Exception e) {
--
2.17.1