
Commit c089a2a

Add rotate_half implementation for fused_rope (#56401)
* add rotate_half in fused_rope
* add position_ids in fused_rope
* modified examples about fused_rope
* add set_device in examples
1 parent be9cb94 commit c089a2a
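For orientation, here is a minimal usage sketch of the op after this change. The argument names and the default use_neox_rotary_style=True come from the YAML signatures below; the Python import path and the sin/cos table construction are assumptions, not taken from this diff:

# Hypothetical usage sketch; argument names follow the YAML op signature
# in this commit, the import path is an assumption.
import paddle
from paddle.incubate.nn.functional import fused_rotary_position_embedding

paddle.set_device("gpu")  # the commit message notes set_device was added to the examples

batch, seq_len, num_heads, head_dim = 2, 8, 2, 16
q = paddle.randn([batch, seq_len, num_heads, head_dim])
k = paddle.randn([batch, seq_len, num_heads, head_dim])
v = paddle.randn([batch, seq_len, num_heads, head_dim])

# A standard RoPE table of shape [1, seq_len, 1, head_dim]; with position_ids
# the table may cover more positions than seq_len (rows are gathered per token).
pos = paddle.arange(seq_len, dtype="float32")
freqs = 1.0 / 10000.0 ** (paddle.arange(0, head_dim, 2, dtype="float32") / head_dim)
angles = paddle.outer(pos, freqs)                  # [seq_len, head_dim // 2]
angles = paddle.concat([angles, angles], axis=-1)  # [seq_len, head_dim]
sin = paddle.sin(angles).reshape([1, seq_len, 1, head_dim])
cos = paddle.cos(angles).reshape([1, seq_len, 1, head_dim])

# position_ids is [batch_size, seq_len], int64.
position_ids = paddle.tile(paddle.arange(seq_len, dtype="int64").unsqueeze(0), [batch, 1])

out_q, out_k, out_v = fused_rotary_position_embedding(
    q, k, v, sin=sin, cos=cos,
    position_ids=position_ids,
    use_neox_rotary_style=False,  # False selects the new rotate_half path
)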

11 files changed: +459 -114 lines


paddle/phi/api/yaml/fused_backward.yaml

Lines changed: 3 additions & 3 deletions

@@ -17,10 +17,10 @@
   support_dygraph_mode : true

 - backward_op : fused_rotary_position_embedding_grad
-  forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos) -> Tensor(out_q), Tensor(out_k), Tensor(out_v)
-  args : (Tensor sin, Tensor cos, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad)
+  forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style) -> Tensor(out_q), Tensor(out_k), Tensor(out_v)
+  args : (Tensor sin, Tensor cos, Tensor position_ids, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad, bool use_neox_rotary_style)
   output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad)
-  optional : sin, cos, out_k_grad, out_v_grad, k_grad, v_grad
+  optional : sin, cos, position_ids, out_k_grad, out_v_grad, k_grad, v_grad
   infer_meta :
     func : FusedRopeGradInferMeta
   kernel :

paddle/phi/api/yaml/fused_ops.yaml

Lines changed: 2 additions & 2 deletions

@@ -149,11 +149,11 @@
   optional : cache_kv, pre_caches, rotary_pos_emb, time_step, seq_lengths, src_mask, gather_index

 - op : fused_rotary_position_embedding
-  args : (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos)
+  args : (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style = true)
   output : Tensor(out_q), Tensor(out_k), Tensor(out_v)
   infer_meta :
     func : FusedRopeInferMeta
-  optional : k,v,sin,cos, out_k, out_v
+  optional : k, v, sin, cos, position_ids, out_k, out_v
   kernel :
     func : fused_rotary_position_embedding
     data_type : q
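
The new use_neox_rotary_style flag chooses between two rotation layouts, matching the two CUDA kernels dispatched in the .cu files below: True keeps the existing behavior (rotate adjacent coordinate pairs), False selects the new rotate_half path (rotate the two halves of head_dim). A plain-NumPy sketch of the two layouts for intuition; the function names mirror the kernel names in this diff, not their fused implementation:

import numpy as np

def rotate_every_two(x):
    # use_neox_rotary_style=True: adjacent pairs (x0, x1) -> (-x1, x0).
    out = np.empty_like(x)
    out[..., 0::2] = -x[..., 1::2]
    out[..., 1::2] = x[..., 0::2]
    return out

def rotate_half(x):
    # use_neox_rotary_style=False: halves (x_lo, x_hi) -> (-x_hi, x_lo).
    half = x.shape[-1] // 2
    return np.concatenate([-x[..., half:], x[..., :half]], axis=-1)

def apply_rope(x, sin, cos, use_neox_rotary_style=True):
    # Both layouts compute x * cos + rotate(x) * sin; only the pairing of
    # coordinates (and the expected sin/cos layout) differs.
    rot = rotate_every_two(x) if use_neox_rotary_style else rotate_half(x)
    return x * cos + rot * sin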

paddle/phi/infermeta/backward.cc

Lines changed: 2 additions & 0 deletions

@@ -1219,9 +1219,11 @@ void IndexPutGradInferMeta(const MetaTensor& x,

 void FusedRopeGradInferMeta(const MetaTensor& sin,
                             const MetaTensor& cos,
+                            const MetaTensor& position_ids,
                             const MetaTensor& dout_q,
                             const MetaTensor& dout_k,
                             const MetaTensor& dout_v,
+                            bool use_neox_rotary_style,
                             MetaTensor* dq,
                             MetaTensor* dk,
                             MetaTensor* dv) {

paddle/phi/infermeta/backward.h

Lines changed: 2 additions & 0 deletions

@@ -186,9 +186,11 @@ void FusedDropoutAddGradInferMeta(const MetaTensor& seed_offset,

 void FusedRopeGradInferMeta(const MetaTensor& sin,
                             const MetaTensor& cos,
+                            const MetaTensor& position_ids,
                             const MetaTensor& dout_q,
                             const MetaTensor& dout_k,
                             const MetaTensor& dout_v,
+                            bool use_neox_rotary_style,
                             MetaTensor* dq,
                             MetaTensor* dk,
                             MetaTensor* dv);

paddle/phi/infermeta/multiary.cc

Lines changed: 2 additions & 0 deletions

@@ -4041,6 +4041,8 @@ void FusedRopeInferMeta(const MetaTensor& q,
                         const MetaTensor& v,
                         const MetaTensor& sin,
                         const MetaTensor& cos,
+                        const MetaTensor& position_ids,
+                        bool use_neox_rotary_style,
                         MetaTensor* out_q,
                         MetaTensor* out_k,
                         MetaTensor* out_v) {

paddle/phi/infermeta/multiary.h

Lines changed: 2 additions & 0 deletions

@@ -807,6 +807,8 @@ void FusedRopeInferMeta(const MetaTensor& q,
                         const MetaTensor& v,
                         const MetaTensor& sin,
                         const MetaTensor& cos,
+                        const MetaTensor& position_ids,
+                        bool use_neox_rotary_style,
                         MetaTensor* out_q,
                         MetaTensor* out_k,
                         MetaTensor* out_v);

paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu

Lines changed: 36 additions & 12 deletions

@@ -27,9 +27,11 @@ template <typename T, typename Context>
 void FusedRopeGradKernel(const Context& dev_ctx,
                          const paddle::optional<DenseTensor>& sin,
                          const paddle::optional<DenseTensor>& cos,
+                         const paddle::optional<DenseTensor>& position_ids,
                          const DenseTensor& dout_q,
                          const paddle::optional<DenseTensor>& dout_k,
                          const paddle::optional<DenseTensor>& dout_v,
+                         bool use_neox_rotary_style,
                          DenseTensor* dq,
                          DenseTensor* dk,
                          DenseTensor* dv) {
@@ -58,6 +60,7 @@ void FusedRopeGradKernel(const Context& dev_ctx,
   phi::Array<T*, 3> outs_data;
   phi::Array<const T*, 3> ins_data;
   phi::Array<const T*, 2> sin_cos_data;
+  const int64_t* position_ids_data = NULL;

   ins_data[0] = dout_q.data<T>();
   outs_data[0] = dq->data<T>();
@@ -86,21 +89,42 @@ void FusedRopeGradKernel(const Context& dev_ctx,
     sin_cos_data[1] = cos->data<T>();

     flag_sin_cos = true;
+
+    if (position_ids.get_ptr()) {
+      position_ids_data = position_ids->data<int64_t>();
+    }
   }

   int sign = -1;
-  VectorizedFusedRopeKernel<T, MPType, vec_size>
-      <<<grid, block, 0, stream>>>(ins_data,
-                                   sin_cos_data,
-                                   flag_sin_cos,
-                                   sign,
-                                   batch_size,
-                                   seq_len,
-                                   num_heads,
-                                   head_dim,
-                                   outs_data,
-                                   num_inputs,
-                                   div_c);
+  if (use_neox_rotary_style) {
+    VectorizedFusedRopeWithRotateEveryTwoKernel<T, MPType, vec_size>
+        <<<grid, block, 0, stream>>>(ins_data,
+                                     sin_cos_data,
+                                     position_ids_data,
+                                     flag_sin_cos,
+                                     sign,
+                                     batch_size,
+                                     seq_len,
+                                     num_heads,
+                                     head_dim,
+                                     outs_data,
+                                     num_inputs,
+                                     div_c);
+  } else {
+    VectorizedFusedRopeWithRotateHalfKernel<T, MPType, vec_size>
+        <<<grid, block, 0, stream>>>(ins_data,
+                                     sin_cos_data,
+                                     position_ids_data,
+                                     flag_sin_cos,
+                                     sign,
+                                     batch_size,
+                                     seq_len,
+                                     num_heads,
+                                     head_dim,
+                                     outs_data,
+                                     num_inputs,
+                                     div_c);
+  }
 }

 }  // namespace fusion
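
Note that the backward dispatches the same two rotation kernels as the forward, only with sign = -1: the map applied by RoPE is an orthogonal rotation, so its gradient is the same rotation by the negated angle applied to the incoming gradient. A self-contained NumPy check of that identity for the rotate_half layout (illustrative only; this reads the sign convention off the diff, it is not the fused kernel):

import numpy as np

def rotate_half(x):
    half = x.shape[-1] // 2
    return np.concatenate([-x[..., half:], x[..., :half]], axis=-1)

rng = np.random.default_rng(0)
d = 8
theta = rng.normal(size=d // 2)
sin = np.sin(np.concatenate([theta, theta]))
cos = np.cos(np.concatenate([theta, theta]))

def rope(x):        # forward, sign = +1
    return x * cos + rotate_half(x) * sin

def rope_grad(g):   # backward, sign = -1: the same formula with -sin
    return g * cos - rotate_half(g) * sin

dout = rng.normal(size=d)
# The forward is linear, y = R @ x; the gradient is R^T @ dout. Recover
# R^T @ dout column by column from the forward map and compare.
r_t_dout = np.array([dout @ rope(np.eye(d)[i]) for i in range(d)])
assert np.allclose(rope_grad(dout), r_t_dout)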

paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu

Lines changed: 78 additions & 21 deletions

@@ -30,6 +30,8 @@ void FusedRopeKernel(const Context& dev_ctx,
                      const paddle::optional<DenseTensor>& v,
                      const paddle::optional<DenseTensor>& sin,
                      const paddle::optional<DenseTensor>& cos,
+                     const paddle::optional<DenseTensor>& position_ids,
+                     bool use_neox_rotary_style,
                      DenseTensor* out_q,
                      DenseTensor* out_k,
                      DenseTensor* out_v) {
@@ -59,6 +61,7 @@ void FusedRopeKernel(const Context& dev_ctx,
   phi::Array<T*, 3> outs_data;
   phi::Array<const T*, 3> ins_data;
   phi::Array<const T*, 2> sin_cos_data;
+  const int64_t* position_ids_data = NULL;

   ins_data[0] = q.data<T>();
   outs_data[0] = out_q->data<T>();
@@ -109,15 +112,52 @@ void FusedRopeKernel(const Context& dev_ctx,
             "The batch_size and num_heads of sin and cos must be 1."));
       }
       int sin_seq_len_dim = (dims_size) == 4 ? 1 : 0;
-      PADDLE_ENFORCE_EQ((sin_dims[dims_size - 1] == head_dim &&
-                         sin_dims[sin_seq_len_dim] == seq_len),
-                        true,
-                        phi::errors::InvalidArgument(
-                            "The seq_len and head_dim of sin and cos "
-                            "must be the same as those of q. But recieved sin's "
-                            "shape is {%s}, q's shape is {%s}.",
-                            sin_dims,
-                            q.dims()));
+
+      if (position_ids.get_ptr()) {
+        PADDLE_ENFORCE_EQ(
+            (sin_dims[dims_size - 1] == head_dim &&
+             sin_dims[sin_seq_len_dim] >= seq_len),
+            true,
+            phi::errors::InvalidArgument(
+                "The seq_len of sin and cos must be greater than or equal to "
+                "this of q. The head_dim of sin and cos must be the same as this "
+                "of q. But recieved sin's "
+                "shape is {%s}, q's shape is {%s}.",
+                sin_dims,
+                q.dims()));
+
+        auto position_ids_dims = position_ids.get_ptr()->dims();
+        PADDLE_ENFORCE_EQ(position_ids_dims.size(),
+                          2,
+                          phi::errors::InvalidArgument(
+                              "The dims of position_ids is expected to "
+                              "be 2, but recieved %d.",
+                              position_ids_dims.size()));
+
+        PADDLE_ENFORCE_EQ(
+            (position_ids_dims[0] == batch_size &&
+             position_ids_dims[1] == seq_len),
+            true,
+            phi::errors::InvalidArgument(
+                "The batch_size and seq_len of position_ids must be the same as "
+                "those of q. But recieved position_ids's "
+                "shape is {%s}, q's shape is {%s}.",
+                position_ids_dims,
+                q.dims()));
+
+        position_ids_data = position_ids->data<int64_t>();
+      } else {
+        PADDLE_ENFORCE_EQ(
+            (sin_dims[dims_size - 1] == head_dim &&
+             sin_dims[sin_seq_len_dim] == seq_len),
+            true,
+            phi::errors::InvalidArgument(
+                "The seq_len and head_dim of sin and cos "
+                "must be the same as those of q. But recieved sin's "
+                "shape is {%s}, q's shape is {%s}.",
+                sin_dims,
+                q.dims()));
+      }

       sin_cos_data[0] = sin->data<T>();
       sin_cos_data[1] = cos->data<T>();
@@ -126,18 +166,35 @@ void FusedRopeKernel(const Context& dev_ctx,
   }

   int sign = 1;
-  VectorizedFusedRopeKernel<T, MPType, vec_size>
-      <<<grid, block, 0, stream>>>(ins_data,
-                                   sin_cos_data,
-                                   flag_sin_cos,
-                                   sign,
-                                   batch_size,
-                                   seq_len,
-                                   num_heads,
-                                   head_dim,
-                                   outs_data,
-                                   num_inputs,
-                                   div_c);
+  if (use_neox_rotary_style) {
+    VectorizedFusedRopeWithRotateEveryTwoKernel<T, MPType, vec_size>
+        <<<grid, block, 0, stream>>>(ins_data,
+                                     sin_cos_data,
+                                     position_ids_data,
+                                     flag_sin_cos,
+                                     sign,
+                                     batch_size,
+                                     seq_len,
+                                     num_heads,
+                                     head_dim,
+                                     outs_data,
+                                     num_inputs,
+                                     div_c);
+  } else {
+    VectorizedFusedRopeWithRotateHalfKernel<T, MPType, vec_size>
+        <<<grid, block, 0, stream>>>(ins_data,
+                                     sin_cos_data,
+                                     position_ids_data,
+                                     flag_sin_cos,
+                                     sign,
+                                     batch_size,
+                                     seq_len,
+                                     num_heads,
+                                     head_dim,
+                                     outs_data,
+                                     num_inputs,
+                                     div_c);
+  }
 }
 }  // namespace fusion
 }  // namespace phi
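
The relaxed shape check above (sin/cos seq_len only needs to be >= q's seq_len when position_ids is passed) reflects how the kernels consume the tables: with position_ids, each token gathers its sin/cos row by absolute position instead of reading rows sequentially, which is what decoding against a KV cache needs. A NumPy sketch of that gather semantics (an illustration of the indexing only, not the CUDA kernel):

import numpy as np

batch, seq_len, num_heads, head_dim = 2, 4, 2, 8
max_pos = 16  # the sin/cos tables may cover more positions than seq_len

# Tables indexed by absolute position: [max_pos, head_dim].
table_angles = np.random.default_rng(0).normal(size=(max_pos, head_dim))
sin_table, cos_table = np.sin(table_angles), np.cos(table_angles)

# position_ids: [batch_size, seq_len], int64 -- e.g. offsets into a KV cache.
position_ids = np.array([[0, 1, 2, 3], [7, 8, 9, 10]], dtype=np.int64)

# Gather per-token rows, then broadcast over heads:
# sin/cos become [batch, seq_len, 1, head_dim].
sin = sin_table[position_ids][:, :, None, :]
cos = cos_table[position_ids][:, :, None, :]

q = np.random.default_rng(1).normal(size=(batch, seq_len, num_heads, head_dim))

def rotate_half(x):
    half = x.shape[-1] // 2
    return np.concatenate([-x[..., half:], x[..., :half]], axis=-1)

out_q = q * cos + rotate_half(q) * sin  # rotate_half path (use_neox_rotary_style=False)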
