-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsetup_commands
40 lines (36 loc) · 1.37 KB
/
setup_commands
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# --- Python environment -------------------------------------------------
# Create and enter a dedicated conda environment named "crawl".
# `-y` answers the confirmation prompts so the commands run unattended.
conda create -n crawl -y
conda activate crawl
conda install pip -y

# Python packages, installed one per command.
pip install requests
pip install warcio
pip install beautifulsoup4
pip install tqdm
pip install bunkai
# emoji is pinned to exactly 1.7.0.
pip install emoji==1.7.0
pip install nltk
pip install mecab-python3
# The PyPI package name is "fugashi" (the original "fughashi" was a typo).
pip install fugashi
pip install ja-sentence-segmenter
pip install datasets
pip install hojichar
pip install scikit-learn
pip install python-tlsh

# torch: left commented out. Uncomment to install PyTorch from the cu121
# wheel index.
#pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# mecab: used for Japanese text analysis.
# Installs the mecab binary, its development headers, and the UTF-8 IPA
# dictionary, one package per apt invocation.
for mecab_pkg in mecab libmecab-dev mecab-ipadic-utf8; do
  sudo apt install "$mecab_pkg" -y
done
# fasttext: used for filtering articles.
pip install gensim
# Download the Japanese fastText vectors into mc4s/annotations/text_labels/
# and decompress them there. This is done WITHOUT changing the working
# directory, so the relative paths used by later commands (e.g.
# warc/data/lm_sp) still resolve from the directory these commands were
# started in.
# NOTE(review): assumes mc4s/ and warc/ are siblings under the same root -
# confirm against the repository layout.
# `-c` resumes a partial download instead of starting a second copy.
wget -c -P mc4s/annotations/text_labels/ https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz
gzip -d mc4s/annotations/text_labels/cc.ja.300.bin.gz
# kenlm: used for analyzing WARC files.
# Build instructions reference: https://zenn.dev/syoyo/articles/529ce949121ca4
# Toolchain and libraries needed to compile kenlm from source.
# `-y` keeps the install non-interactive, like the other apt commands here.
sudo apt install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
pip install https://github.com/kpu/kenlm/archive/master.zip
# "<3.20" replaces the original "<3.20.*": a ".*" suffix is only valid with
# the == and != operators, and current pip rejects it as an invalid specifier.
python -m pip install sentencepiece "protobuf<3.20"
# Fetch the Japanese language model and sentencepiece model from the cc_net
# distribution into warc/data/lm_sp (created if missing). `-c` resumes
# partial downloads; https matches the fastText download from the same host.
mkdir -p warc/data/lm_sp
wget -c -P warc/data/lm_sp https://dl.fbaipublicfiles.com/cc_net/lm/ja.arpa.bin
wget -c -P warc/data/lm_sp https://dl.fbaipublicfiles.com/cc_net/lm/ja.sp.model