
Commit 6bd60a9

pham.huu.quang authored and committed
fix coding convention
hotfix topdup_app init
1 parent 5f85afc commit 6bd60a9

File tree

16 files changed: 352 additions & 275 deletions


src/topdup_open/autoload_data/_config.py

Lines changed: 16 additions & 14 deletions
@@ -1,27 +1,29 @@
 import os
-from dotenv import load_dotenv #comment if using docker
-#load environment variable
-load_dotenv('.env')#comment if using docker
+from dotenv import load_dotenv  # comment if using docker
+
+# load environment variable
+load_dotenv(".env")  # comment if using docker
 # RabbitMQ host
-HOST = 'tech-monitor.vnalert.vn'
+HOST = "tech-monitor.vnalert.vn"
 PORT = 19000
-USERNAME = os.environ['USERNAME_MONITOR']
-PASSWORD = os.environ['PASSWORD_MONITOR']
-EXCHANGE = 'docbao_tech_protect'
-POST_QUEUE = 'tech_protect_AI' # queue to bind to get posts
+USERNAME = os.environ["USERNAME_MONITOR"]
+PASSWORD = os.environ["PASSWORD_MONITOR"]
+EXCHANGE = "docbao_tech_protect"
+POST_QUEUE = "tech_protect_AI"  # queue to bind to get posts
 MAX_POST = 10  # number of post to push each queue
 WAIT_BETWEEN_POST = 0.5
 
 # file, model path
 PROJECT_DIR = os.getcwd()
 # PROJECT_DIR = '/app'
-EMBEDDING_FILE = f'{PROJECT_DIR}/dataset/post_embedding.pkl'
-DATABASE_URI = f'sqlite:///{PROJECT_DIR}/dataset/post_database.db'
-TF_IDF = f'{PROJECT_DIR}/dataset/tf_idf_model.pkl'
-FAKE_DATASET = f'{PROJECT_DIR}/dataset/fake_dataset.csv'
-LOG_FILE = f'{PROJECT_DIR}/dataset/logs.txt'
+EMBEDDING_FILE = f"{PROJECT_DIR}/dataset/post_embedding.pkl"
+DATABASE_URI = f"sqlite:///{PROJECT_DIR}/dataset/post_database.db"
+TF_IDF = f"{PROJECT_DIR}/dataset/tf_idf_model.pkl"
+FAKE_DATASET = f"{PROJECT_DIR}/dataset/fake_dataset.csv"
+LOG_FILE = f"{PROJECT_DIR}/dataset/logs.txt"
 
-PICKLE_DATASET = f'{PROJECT_DIR}/dataset/post_dataset.pkl' ## save data for debug
+# save data for debug
+PICKLE_DATASET = f"{PROJECT_DIR}/dataset/post_dataset.pkl"
 
 # other global variable
 TOP_K = 5
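
Since the reworked _config.py reads the RabbitMQ credentials from the environment via python-dotenv, a local run needs a .env file in the working directory. A minimal sketch, with placeholder values that are not from the repository:

# .env (placeholders only; real credentials are never part of this commit)
USERNAME_MONITOR=your-rabbitmq-username
PASSWORD_MONITOR=your-rabbitmq-password

By default python-dotenv does not override variables that are already set, so exporting them in the shell or injecting them via Docker (as the inline "comment if using docker" notes suggest) works just as well.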

src/topdup_open/autoload_data/data_utils.py

Lines changed: 45 additions & 54 deletions
@@ -1,22 +1,15 @@
-##################################################################################################
-#Rabbitmq: Docbao Rabbitmq Client - Dang Hai Loc #
-#Function: Get crawled posts through RabbitMQ #
-##################################################################################################
+######################################################################
+# Rabbitmq: Docbao Rabbitmq Client - Dang Hai Loc #
+# Function: Get crawled posts through RabbitMQ #
+######################################################################
 
 
-import os
 import pika
-import sys
-import time
 import pickle
-import numpy as np
-from time import sleep
-from random import randint, choice, choices
-from datetime import datetime, timedelta
-from scipy.sparse import csr_matrix, vstack
+from scipy.sparse import vstack
 from sklearn.metrics.pairwise import cosine_similarity
 
-from ._config import *
+import _config
 from .raw_post import RawPost
 from .post_orm import Post
 from .post_orm import create_session, load_pickle_data
@@ -30,11 +23,11 @@
 
 
 def handle_post(new_posts):
-    """ Handle post:
-    Compute post_embedding,
-    Search nearest post candidate for each post base on post_embedding
-    Re-compute similarity_score for each candidate by Jaccard metric in compute_doc_similarity()
-    Save post to database and pickle file
+    """Handle post:
+    Compute post_embedding,
+    Search nearest post candidate for each post base on post_embedding
+    Re-compute similarity_score for each candidate by Jaccard metric
+    in compute_doc_similarity(). Save post to database and pickle file
     """
     if len(new_posts) == 0:
         return
@@ -48,14 +41,14 @@ def handle_post(new_posts):
             post.embedd_vector = None
 
     new_posts = [post for post in new_posts if post.embedd_vector is not None]
-    old_posts = load_pickle_data(EMBEDDING_FILE)
+    old_posts = load_pickle_data(_config.EMBEDDING_FILE)
     logger.debug(f"OLD POSTS LENGTH: {len(old_posts)}")
     session.commit()
 
     # compute and search nearest post
     if len(old_posts) > 0 and len(new_posts) > 0:
-        old_ids = [post['id'] for post in old_posts]
-        old_vectors = vstack([post['vector'] for post in old_posts])
+        old_ids = [post["id"] for post in old_posts]
+        old_vectors = vstack([post["vector"] for post in old_posts])
         new_vectors = vstack([post.embedd_vector for post in new_posts])
 
         # sim_matrix[i,j] - similarity score of (new_posts[i], old_posts[j])
@@ -65,39 +58,33 @@ def handle_post(new_posts):
 
         for i, post in enumerate(new_posts):
             score_list = enumerate(list(sim_matrix[i]))
-            topK_score = sorted(
-                score_list, key=lambda x: x[1], reverse=True)[:TOP_K]
-            similarity_info = []
+            topK_score = sorted(score_list,
+                                key=lambda x: x[1],
+                                reverse=True)[:_config.TOP_K]
 
             # get similarity score with compute_doc_similarity function
             for index, _ in topK_score:
                 sim_id = old_ids[index]
                 sim_post = session.query(Post).get(sim_id)
                 if (sim_post is not None) and (post.url != sim_post.url):
-                    score = compute_doc_similarity(post.content, sim_post.content)
+                    score = compute_doc_similarity(post.content,
+                                                   sim_post.content)
 
                     # append similarity info to database
-                    if score > SAVE_THRESH:
-                        post.add_similar_info({
-                            "id": sim_id,
-                            "score": score,
-                            "url": sim_post.url
-                        })
-                        sim_post.add_similar_info({
-                            'id': post.id,
-                            'score': score,
-                            'url': post.url
-                        })
+                    if score > _config.SAVE_THRESH:
+                        post.add_similar_info(
+                            {"id": sim_id, "score": score, "url": sim_post.url}
+                        )
+                        sim_post.add_similar_info(
+                            {"id": post.id, "score": score, "url": post.url}
+                        )
         del sim_matrix
 
     # re-save all post embedding to pickle file
     for post in new_posts:
-        old_posts.append({
-            'id': post.id,
-            'vector': post.embedd_vector
-        })
+        old_posts.append({"id": post.id, "vector": post.embedd_vector})
 
-    f = open(EMBEDDING_FILE, 'wb+')
+    f = open(_config.EMBEDDING_FILE, "wb+")
     pickle.dump(old_posts, f)
    f.close()
    session.commit()
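
As an aside on the hunk above: the sorted(enumerate(...))[:TOP_K] idiom picks, for each new post, the indices of the TOP_K most similar old posts out of a cosine-similarity matrix. A self-contained sketch of that same pattern, with illustrative array shapes and names that are not from the repository:

# Illustrative only: mirrors the top-K selection used in handle_post().
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

old_vectors = np.random.rand(100, 64)  # stand-in for stacked old embeddings
new_vectors = np.random.rand(3, 64)    # stand-in for new post embeddings

# sim_matrix[i, j] - similarity of (new_vectors[i], old_vectors[j])
sim_matrix = cosine_similarity(new_vectors, old_vectors)

TOP_K = 5
for i, row in enumerate(sim_matrix):
    top_k = sorted(enumerate(row), key=lambda x: x[1], reverse=True)[:TOP_K]
    print(i, [(idx, round(score, 3)) for idx, score in top_k])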
@@ -106,11 +93,14 @@ def handle_post(new_posts):
 
 """
 HOW TO USE
-This program will check repeatedly if there are new post in RabbitMQ queue. If there are new posts,
-it will parse binary message into Post() object, and for each Post instance, call Post.push_to_database()
+This program will check repeatedly if there are new post in RabbitMQ queue.
+If there are new posts, it will parse binary message into Post() object,
+and for each Post instance, call Post.push_to_database()
 to save it in database.
 """
-def read_data_from_source(data_source='rabbitmq', save_raw_data=False):
+
+
+def read_data_from_source(data_source="rabbitmq", save_raw_data=False):
     """
     Start a process that get data from RabbitMQ then push to database
     """
@@ -120,33 +110,34 @@ def read_data_from_source(data_source='rabbitmq', save_raw_data=False):
         posts = [RawPost(body).to_orm_post() for body in all_body]
         return posts
 
-    if data_source == 'csv_dataset':
-        posts = [fake_data() for i in range(MAX_POST)]
+    if data_source == "csv_dataset":
+        posts = [fake_data() for i in range(_config.MAX_POST)]
         return posts
 
     # connect to RabbitMQ
     # login
 
-    credentials = pika.PlainCredentials(USERNAME, PASSWORD)
-    parameters = pika.ConnectionParameters(HOST, PORT, '/', credentials)
+    credentials = pika.PlainCredentials(_config.USERNAME, _config.PASSWORD)
+    parameters = pika.ConnectionParameters(_config.HOST,
+                                           _config.PORT, "/",
+                                           credentials)
     connection = pika.BlockingConnection(parameters)
 
     channel = connection.channel()
-    queue_state = channel.queue_declare(POST_QUEUE, durable=True, passive=True)
-    channel.queue_bind(exchange=EXCHANGE, queue=POST_QUEUE)
+    queue_state = channel.queue_declare(_config.POST_QUEUE,
+                                        durable=True, passive=True)
+    channel.queue_bind(exchange=_config.EXCHANGE, queue=_config.POST_QUEUE)
     queue_length = queue_state.method.message_count
     logger.debug(f"QUEUE LENGTH: {queue_length}")
 
     # start get message
-    load_time = 0
     count_post = 0
-    raw_posts = []
     posts = []
 
-    while (queue_length >= 1 and count_post < MAX_POST):
+    while queue_length >= 1 and count_post < _config.MAX_POST:
         queue_length -= 1
         count_post += 1
-        _, _, body = channel.basic_get(POST_QUEUE, auto_ack=True)
+        _, _, body = channel.basic_get(_config.POST_QUEUE, auto_ack=True)
         if body is not None:
             # parse message into Post
             post = RawPost(body)
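
For orientation, the "HOW TO USE" docstring above describes a poll-parse-store loop. A minimal driver consistent with that description might look like the sketch below; the module path and the use of WAIT_BETWEEN_POST as the poll interval are assumptions, since the actual entry point is not part of this diff.

# Hypothetical driver loop (not in this commit): pull posts from RabbitMQ,
# score and store them, then sleep before polling again.
import time

from topdup_open.autoload_data import _config
from topdup_open.autoload_data.data_utils import read_data_from_source, handle_post

while True:
    posts = read_data_from_source(data_source="rabbitmq")
    handle_post(posts)
    time.sleep(_config.WAIT_BETWEEN_POST)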

src/topdup_open/autoload_data/log.py

Lines changed: 3 additions & 4 deletions
@@ -5,10 +5,9 @@
 def get_logger(name, f_name=_config.LOG_FILE):
     logger = logging.getLogger(name)
     logger.setLevel(logging.DEBUG)
-    formater = logging.Formatter(
-        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
-    fhanler = logging.FileHandler(f_name, 'a+', encoding='utf-8')
+    formater = logging.Formatter("%(asctime)s - %(name)s "
+                                 "- %(levelname)s - %(message)s")
+    fhanler = logging.FileHandler(f_name, "a+", encoding="utf-8")
     fhanler.setFormatter(formater)
     fhanler.setLevel(logging.DEBUG)
     logger.addHandler(fhanler)
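
get_logger() is what routes the logger.debug(...) calls seen in data_utils.py and post_orm.py into _config.LOG_FILE. An assumed usage sketch (the actual call sites are outside this diff, and the import path is an assumption):

# Illustrative usage of the helper above, not taken from this commit.
from topdup_open.autoload_data.log import get_logger

logger = get_logger(__name__)
logger.debug("autoload_data worker started")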

src/topdup_open/autoload_data/post_orm.py

Lines changed: 37 additions & 29 deletions
@@ -2,12 +2,10 @@
 import pickle
 import json
 import datetime
-import sqlalchemy
 import pandas as pd
-from glob import glob
-from random import randint, shuffle, choice, choices
+from random import randint
 from sqlalchemy import create_engine
-from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float
+from sqlalchemy import Column, Integer, String, DateTime, Float
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from .log import get_logger
@@ -18,24 +16,25 @@
 engine = create_engine(_config.DATABASE_URI, echo=False)
 Base = declarative_base()
 
+
 class Post(Base):
     """ORM class to communicate with database"""
-
-    __tablename__ = 'post'
+
+    __tablename__ = "post"
 
     id = Column(Integer, primary_key=True, autoincrement=True)
     title = Column(String)
     content = Column(String)
-    author = Column(String, default='')
-    publish_time = Column(String, default='')
+    author = Column(String, default="")
+    publish_time = Column(String, default="")
     updated_time = Column(DateTime, default=datetime.datetime.utcnow)
     url = Column(String)
     max_score = Column(Float, default=0.0)
 
     # similar_post_info: save all post_id and score that nearest the post,
     # format: [{id:, score:},..], save in database with String type
     similar_post_info = Column(String, default=json.dumps([]))
-    embedd_vector = None # not saved in database
+    embedd_vector = None  # not saved in database
 
     def set_similar_post_info(self, similar_info):
         """
@@ -44,10 +43,11 @@ def set_similar_post_info(self, similar_info):
         """
         if len(similar_info) == 0:
             return False
-        similar_info = sorted(
-            similar_info, key=lambda x: x['score'], reverse=True)
+        similar_info = sorted(similar_info,
+                              key=lambda x: x["score"],
+                              reverse=True)
         self.similar_post_info = json.dumps(similar_info)
-        self.max_score = round(similar_info[0]['score'], 3)
+        self.max_score = round(similar_info[0]["score"], 3)
         return True
 
     def add_similar_info(self, post_info):
@@ -57,12 +57,12 @@ def add_similar_info(self, post_info):
         """
         json_info = json.loads(self.similar_post_info)
         for item in json_info:
-            if item['url'] == post_info['url']:
-                return None
+            if item["url"] == post_info["url"]:
+                return None
 
         json_info.append(post_info)
-        json_info = sorted(json_info, key=lambda x: x['score'], reverse=True)
-        self.max_score = round(json_info[0]['score'], 3)
+        json_info = sorted(json_info, key=lambda x: x["score"], reverse=True)
+        self.max_score = round(json_info[0]["score"], 3)
         self.similar_post_info = json.dumps(json_info)
 
     def get_similar_post_info(self):
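
A note on the data shape these two methods manage: similar_post_info holds a JSON-encoded list of {id, score, url} dicts kept sorted by score, with max_score mirroring the best entry. A tiny stand-alone illustration of that invariant (the field names come from the diff; the sample values are made up):

# Illustrative only: the JSON round-trip behind similar_post_info.
import json

similar_info = [
    {"id": 7, "score": 0.81, "url": "https://example.com/a"},
    {"id": 3, "score": 0.93, "url": "https://example.com/b"},
]
similar_info = sorted(similar_info, key=lambda x: x["score"], reverse=True)
stored = json.dumps(similar_info)               # what goes into the String column
max_score = round(similar_info[0]["score"], 3)  # 0.93
print(stored, max_score)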
@@ -79,6 +79,7 @@ def __repr__(self):
 Base.metadata.create_all(engine)
 Session = sessionmaker(bind=engine)
 
+
 def create_session():
     """Init session for ORM classes"""
     session = Session()
@@ -89,48 +90,55 @@ def load_pickle_data(fn):
     all_data = []
     if os.path.isfile(fn):
         try:
-            f = open(fn, 'rb+')
+            f = open(fn, "rb+")
             all_data = pickle.load(f)
             f.close()
-        except:
+        except Exception:
             logger.exception("pickle file is empty")
     return all_data
 
 
 def check_valid_post(post, session):
     try:
-        l = len(post.content)
-        if l < _config.MIN_CHARACTER_LEN:
-            logger.debug(f'post content is too short: length {l}, {post.title}, {post.url}')
+        num_content = len(post.content)
+        if num_content < _config.MIN_CHARACTER_LEN:
+            logger.debug(
+                f"post content is too short: length "
+                "{num_content}, {post.title}, {post.url}"
+            )
             return False
 
         all_post = session.query(Post.title, Post.url).all()
         for title, url in all_post:
             if post.title == title and post.url == url:
-                logger.debug(f'This post is already exists in database: {post.title}')
+                logger.debug(
+                    f"This post is already exists in database: {post.title}"
+                )
                 return False
         return True
 
     except Exception as e:
         logger.exception(e)
         return False
 
+
 df = None
 
+
 def fake_data():
     global df
     if df is None:
         df = pd.read_csv(_config.FAKE_DATASET)
-    id = randint(0, len(df)-1)
+    id = randint(0, len(df) - 1)
     item = df.loc[id]
-
+
     try:
-        url = item['link']
-    except:
-        url = ''
+        url = item["link"]
+    except Exception:
+        url = ""
     post = Post(
-        title=item['title'],
-        content=item['content'],
+        title=item["title"],
+        content=item["content"],
         url=url,
     )
     return post
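
Putting the helpers in this file together, a local smoke test might look like the sketch below. It assumes the package import path, that _config.FAKE_DATASET points at an existing CSV with title/content/link columns, and that _config.MIN_CHARACTER_LEN is defined; none of that is established by this diff.

# Hypothetical smoke test (not part of the commit): build a fake Post from
# the CSV dataset, validate it, and persist it through the ORM session.
from topdup_open.autoload_data.post_orm import (
    create_session,
    fake_data,
    check_valid_post,
)

session = create_session()
post = fake_data()
if check_valid_post(post, session):
    session.add(post)
    session.commit()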
