-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsetup.sh
45 lines (37 loc) · 1.25 KB
/
setup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Create the "textprocess" conda environment, activate it, and install the
# base Python tooling into it.
conda create -n textprocess -y
# `conda activate` is a shell function that only exists once conda's shell
# hook has been loaded; a non-interactive script does not load it by itself.
eval "$(conda shell.bash hook)"
# Stop if activation fails: otherwise every following `pip install` would go
# into whichever Python happens to be first on PATH.
conda activate textprocess || {
  echo "setup.sh: failed to activate conda environment 'textprocess'" >&2
  # `return` handles `source setup.sh`; `exit` handles `bash setup.sh`.
  return 1 2>/dev/null || exit 1
}
conda install pip -y
pip install datasets==2.18.0
# MeCab: used for Japanese text analysis.
# System packages first, one apt invocation per package.
for apt_pkg in mecab libmecab-dev mecab-ipadic-utf8; do
  sudo apt install "$apt_pkg" -y
done
# Python packages next, each pinned to an exact version and installed with
# its own pip invocation.
for py_spec in mecab==0.996.3 ja-sentence-segmenter==0.0.2 hojichar==0.9.0; do
  pip install "$py_spec"
done
# Text clustering dependencies, each pinned to an exact version and installed
# with its own pip invocation.
for cluster_spec in gensim==4.3.2 scikit-learn==1.4.1.post1; do
  pip install "$cluster_spec"
done
# Disabled extras, kept for reference.
# NOTE(review): `jinja2-3.1.3` below looks like a typo for `jinja2==3.1.3` —
# confirm before re-enabling.
#pip install dask==2024.3.1
#pip install "dask[dataframe]"
#pip install jinja2-3.1.3
# Model download (w2v): fetch and unpack the word-vector archive into
# data/model.
# `mkdir -p` creates data/ when it is missing and does not fail on a re-run.
mkdir -p data/model
# The subshell confines the directory change, so the caller's working
# directory is the same afterwards whether or not a step fails.
(
  cd data/model || exit 1
  # Alternative model (disabled):
  #wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz
  #gzip -d cc.ja.300.bin.gz
  # This one is more lightweight.
  wget http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/data/20170201.tar.bz2 || exit 1
  tar -xjvf 20170201.tar.bz2
) || {
  echo "setup.sh: failed to download or unpack the word-vector model" >&2
  # `return` handles `source setup.sh`; `exit` handles `bash setup.sh`.
  return 1 2>/dev/null || exit 1
}
# Dedup build: fetch the dedup_sentence sources plus the smhasher and
# simdjson files it is built with, then run make.
# Tightening the hasher in main.cpp makes the deduplication stronger.
# Hasher hasher(5, 500, 50, 10);
sudo apt install nlohmann-json3-dev -y
# Skip the clone when a checkout already exists so the script can be re-run.
[ -d dedup_sentence ] || git clone https://github.com/if001/dedup_sentence
# The remaining steps are chained: nothing is downloaded or built unless the
# `cd` into the checkout succeeded, and `make` only runs on complete inputs.
# `wget -O` overwrites in place; plain wget would save a repeated download as
# simdjson.h.1 and the build would keep using the stale copy.
if ! {
  cd dedup_sentence &&
    { [ -d smhasher ] || git clone https://github.com/aappleby/smhasher.git; } &&
    wget -O simdjson.h https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.h &&
    wget -O simdjson.cpp https://raw.githubusercontent.com/simdjson/simdjson/master/singleheader/simdjson.cpp &&
    make
}; then
  echo "setup.sh: building dedup_sentence failed" >&2
  # `return` handles `source setup.sh`; `exit` handles `bash setup.sh`.
  return 1 2>/dev/null || exit 1
fi