File tree 1 file changed +38
-0
lines changed
rag_implement_by_suing_opensearch
1 file changed +38
-0
lines changed Original file line number Diff line number Diff line change
1
+ #1.필요 패키지 설치
2
+ # !pip install -q boto3
3
+ # !pip install -q requests
4
+ # !pip install -q requests-aws4auth
5
+ # !pip install -q opensearch-py
6
+ # !pip install -q tqdm
7
+ # !pip install -q boto3
8
+
9
+ #2.데이터준비
10
+ import pandas as pd
11
+ import requests
12
+
13
# Load the movie dataset from the local CSV file and preview the first rows.
df = pd.read_csv("./data/movies.csv", low_memory=False)
df.head(5)

# Check the data schema (column names, dtypes, non-null counts).
df.info()
18
+
19
+
20
# The embedding model used here is Cohere Multilingual, whose maximum sequence
# length is noted as 512. The "plot" column is therefore capped at 512
# characters: 509 characters of text plus the 3-character "..." marker.
max_length = 509


def truncate_plot(plot):
    """Return *plot* shortened to at most ``max_length + 3`` characters.

    Args:
        plot: The plot text of one record. Non-string values (for example the
            float NaN that pandas uses for missing CSV cells) are returned
            unchanged instead of raising ``TypeError`` from ``len()``.

    Returns:
        The original value when it is not a string or already fits within
        ``max_length`` characters; otherwise its first ``max_length``
        characters followed by ``"..."``.
    """
    if not isinstance(plot, str):
        # Missing values have no length; pass them through untouched.
        return plot
    if len(plot) <= max_length:
        return plot
    return plot[:max_length] + "..."
29
+
30
# Truncate every value of the "plot" column in place.
df["plot"] = df["plot"].apply(truncate_plot)
32
+
33
# Check whether any record still has a plot (synopsis) longer than 512
# characters.
def find_long_plot_items(df, limit=512):
    """Return the rows of *df* whose ``"plot"`` text is longer than *limit*.

    Args:
        df: DataFrame with a string-valued ``"plot"`` column.
        limit: Maximum allowed plot length in characters. Defaults to 512.

    Returns:
        A DataFrame holding only the rows whose plot length exceeds *limit*;
        it is empty when every plot fits.
    """
    plot_lengths = df["plot"].str.len()
    # Missing plots yield a NaN length, which never compares greater than
    # the limit, so those rows are not reported.
    return df[plot_lengths > limit]
37
+
38
# Per-column count of rows whose plot is still over the limit after truncation.
find_long_plot_items(df).count()
You can’t perform that action at this time.
0 commit comments