Skip to content

Commit 6dcfe07

Browse files
committed
semantic - 필수패키지적재 & 데이터 준비
1 parent 388b28d commit 6dcfe07

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#1.필요 패키지 설치
2+
# !pip install -q boto3
3+
# !pip install -q requests
4+
# !pip install -q requests-aws4auth
5+
# !pip install -q opensearch-py
6+
# !pip install -q tqdm
7+
# !pip install -q boto3
8+
9+
#2.데이터준비
10+
import pandas as pd
11+
import requests
12+
13+
df = pd.read_csv("./data/movies.csv", low_memory=False)
14+
df.head(5)
15+
16+
##데이터 스키마 확인
17+
df.info()
18+
19+
20+
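# The embedding model used here is Cohere Multilingual.
21+
# Its maximum sequence length is 512 (presumably counted in tokens; the code below uses a character count as a proxy - TODO confirm).
22+
# The plot column is therefore truncated to at most 512 characters: 509 characters of text plus a trailing "...".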
23+
# Number of plot characters kept before the "..." suffix is appended.
# 509 + len("...") == 512, so a truncated plot is exactly 512 characters long.
max_length = 509


def truncate_plot(plot):
    """Shorten an over-long plot string and mark the cut with "...".

    Args:
        plot: The plot text of one record. Non-string values (for example the
            NaN that pandas uses for a missing cell, or None) are accepted.

    Returns:
        ``plot`` unchanged when it is not a string or has at most
        ``max_length`` characters; otherwise the first ``max_length``
        characters followed by "...".
    """
    # Missing cells arrive as float NaN / None; len() would raise TypeError on
    # them, so pass anything that is not text through untouched.
    if not isinstance(plot, str):
        return plot
    if len(plot) > max_length:
        return plot[:max_length] + "..."
    return plot
29+
30+
# Apply the function to the 'plot' column
31+
df["plot"] = df["plot"].apply(truncate_plot)
32+
33+
#plot(줄거리)의 길이가 512가 넘는 레코드가 있는지 체크
34+
def find_long_plot_items(df, limit=512):
    """Return the rows of ``df`` whose "plot" text is longer than ``limit``.

    Args:
        df: A DataFrame with a "plot" column holding text.
        limit: Maximum allowed number of characters; rows strictly longer
            than this are returned. Defaults to 512.

    Returns:
        A DataFrame containing only the rows whose plot length exceeds
        ``limit`` (empty when every plot fits).
    """
    # Rows with a missing plot have a NaN length, which compares as False and
    # is therefore never reported as too long.
    return df[df["plot"].str.len() > limit]
37+
38+
find_long_plot_items(df).count()

0 commit comments

Comments
 (0)