Merge pull request #108 from hyejungg/develop
merge to main
hyejungg authored Feb 6, 2025
2 parents 0dee1d7 + e959491 commit c1fdc43
Showing 9 changed files with 88 additions and 55 deletions.
5 changes: 3 additions & 2 deletions news-scraper-agent/README.md
@@ -7,15 +7,16 @@

## How to run
```shell
-poetry shell
+poetry env activate
poetry install
python {filename}.py
```

## How to deploy
```shell
# Run locally
-sam local invoke
+sam local invoke --config-env {phase} {image name}


# Build, then deploy
sam build --config-env {phase}
```
8 changes: 3 additions & 5 deletions news-scraper-agent/config/env_config.py
@@ -1,8 +1,6 @@
+from typing import Optional
+
from dotenv import load_dotenv
from pydantic_settings import BaseSettings, SettingsConfigDict

-from typing import Optional

load_dotenv()

@@ -14,12 +12,12 @@ class Environment(BaseSettings):
    # db
    MONGO_DB_LOCAL_URI: str
    MONGO_DB_DEV_URI: str
-    MONGO_DB_REAL_URI: str
+    MONGO_DB_PROD_URI: str

    # kakaoworks
    KAWORK_WEBHOOK_LOCAL_URI: str
    KAWORK_WEBHOOK_DEV_URI: str
-    KAWORK_WEBHOOK_REAL_URI: str
+    KAWORK_WEBHOOK_PROD_URI: str

    # langsmith
    LANGCHAIN_ENDPOINT: Optional[str] = None
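For readers unfamiliar with pydantic-settings, a minimal runnable sketch of the pattern this file uses follows. Field names mirror the diff; the defaults and the final print are illustrative only, since the real class takes its values from the environment after load_dotenv().

```python
# Minimal sketch, not the repo's actual file: pydantic-settings populates
# typed fields from environment variables (placeholder defaults are added
# here only so the sketch runs standalone; the real fields are required).
from typing import Optional

from pydantic_settings import BaseSettings


class Environment(BaseSettings):
    PROFILE: str = "local"

    # db
    MONGO_DB_LOCAL_URI: str = "mongodb://localhost:27017/news"
    MONGO_DB_DEV_URI: str = "mongodb://dev.example.com/news"
    MONGO_DB_PROD_URI: str = "mongodb://prod.example.com/news"

    # kakaoworks
    KAWORK_WEBHOOK_LOCAL_URI: str = "https://example.com/webhook/local"
    KAWORK_WEBHOOK_DEV_URI: str = "https://example.com/webhook/dev"
    KAWORK_WEBHOOK_PROD_URI: str = "https://example.com/webhook/prod"

    # langsmith
    LANGCHAIN_ENDPOINT: Optional[str] = None


env = Environment()
print(env.PROFILE)  # "local" unless PROFILE is set in the environment
```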
2 changes: 1 addition & 1 deletion news-scraper-agent/config/log.py
@@ -29,7 +29,7 @@ def _initialize_logger(self):
        formatter = logging.Formatter(fmt="%(name)16s - %(message)s")

        # Set the logger level
-        self.setLevel(logging.DEBUG if env.PROFILE != "real" else logging.INFO)
+        self.setLevel(logging.DEBUG if env.PROFILE != "prod" else logging.INFO)

        # Add a RichHandler
        rich_handler = RichHandler(
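The level logic above is small but easy to get backwards; here is a self-contained sketch of the same pattern, with os.getenv standing in for the repo's env.PROFILE (an assumption made for runnability):

```python
# DEBUG on every profile except "prod" (the value this PR renames from "real").
import logging
import os

from rich.logging import RichHandler

PROFILE = os.getenv("PROFILE", "local")  # stand-in for env.PROFILE

logger = logging.getLogger("news-scraper-agent")
logger.setLevel(logging.DEBUG if PROFILE != "prod" else logging.INFO)
logger.addHandler(RichHandler())

logger.debug("shown unless PROFILE == 'prod'")
logger.info("always shown")
```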
2 changes: 1 addition & 1 deletion news-scraper-agent/external/kakaowork/client.py
@@ -9,7 +9,7 @@
HTTP_CONTENT_TYPE = "application/json"

WEBHOOK_URL_MAP = {
-    "real": env.KAWORK_WEBHOOK_REAL_URI,
+    "prod": env.KAWORK_WEBHOOK_PROD_URI,
    "dev": env.KAWORK_WEBHOOK_DEV_URI,
    "local": env.KAWORK_WEBHOOK_LOCAL_URI,
}
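How WEBHOOK_URL_MAP is presumably consumed; a hedged sketch, since the client's send path is outside this diff. The send_message helper, payload shape, and URLs are invented for illustration:

```python
import requests

HTTP_CONTENT_TYPE = "application/json"

# Placeholder URLs standing in for the env.KAWORK_WEBHOOK_*_URI settings.
WEBHOOK_URL_MAP = {
    "prod": "https://example.com/webhook/prod",
    "dev": "https://example.com/webhook/dev",
    "local": "https://example.com/webhook/local",
}


def send_message(profile: str, text: str) -> int:
    """Invented helper: POST a JSON payload to the profile's webhook."""
    url = WEBHOOK_URL_MAP[profile]  # raises KeyError for unknown profiles
    resp = requests.post(
        url,
        json={"text": text},
        headers={"Content-Type": HTTP_CONTENT_TYPE},
        timeout=10,
    )
    return resp.status_code
```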
17 changes: 4 additions & 13 deletions news-scraper-agent/loader/connect.py
@@ -1,29 +1,20 @@
+from mongoengine import connect
+
from config.env_config import env
from config.log import NewsScraperAgentLogger
-from models.message import Message
-from models.site import Site
-from mongoengine import connect

logger = NewsScraperAgentLogger()


def connect_db():
    try:
-        if env.PROFILE == "real":
-            connect(host=env.MONGO_DB_REAL_URI)
+        if env.PROFILE == "prod":
+            connect(host=env.MONGO_DB_PROD_URI)
        elif env.PROFILE == "develop":
            connect(host=env.MONGO_DB_DEV_URI)
        else:
            connect(host=env.MONGO_DB_LOCAL_URI)

-        logger.info("MongoDB Connected ...")
-
-        # Check that collections are created
-        Site.ensure_indexes()
-        logger.info("Site Collection is ready!")
-        Message.ensure_indexes()
-        logger.info("Message Collection is ready!")
+        logger.info(f"MongoDB connected. phase={env.PROFILE}")

    except Exception as err:
        logger.error(f"MongoDB connect error: {err}")
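Worth noting: the deleted ensure_indexes() calls are not simply gone. The model diffs below add "auto_create_index": True to each document's meta, which lets mongoengine create missing indexes on first use. A sketch of the assumed call pattern:

```python
# Assumed usage: connect once per process (e.g. per Lambda cold start);
# index creation now happens via the models' auto_create_index meta
# instead of explicit ensure_indexes() calls at connect time.
from loader.connect import connect_db

connect_db()  # logs "MongoDB connected. phase=<PROFILE>" on success


def handler(event, context):  # hypothetical Lambda entrypoint
    ...
```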
24 changes: 13 additions & 11 deletions news-scraper-agent/models/message.py
@@ -1,41 +1,43 @@
+from typing import Optional
+
from mongoengine import (
    Document,
    StringField,
    ListField,
    EmbeddedDocument,
    EmbeddedDocumentField,
    DateTimeField,
    ObjectIdField,
)
from pydantic import BaseModel

-from typing import Optional
from utils.time_utils import get_datetime_kst


class MessageContent(EmbeddedDocument):
-    name = StringField(required=False)
-    title = StringField(required=False)
-    url = StringField(required=False)
+    _id = ObjectIdField(required=True, db_field="_id")  # add the _id field
+    name = StringField(required=False, db_field="name")
+    title = StringField(required=False, db_field="title")
+    url = StringField(required=False, db_field="url")


class Message(Document):
-    type = StringField(required=True)
-    status = StringField(required=True)
-    messages = ListField(EmbeddedDocumentField(MessageContent))
+    type = StringField(required=True, db_field="type")
+    status = StringField(required=True, db_field="status")
+    messages = ListField(EmbeddedDocumentField(MessageContent), db_field="messages")

    # Define createdAt and updatedAt fields directly instead of the timestamps option
-    createdAt = DateTimeField()
-    updatedAt = DateTimeField()
+    createdAt = DateTimeField(db_field="createdAt")
+    updatedAt = DateTimeField(db_field="updatedAt")

    meta = {
        "collection": "messages",
        "indexes": [
            {
                "name": "createdAt_1",
                "fields": ["createdAt"],
                "expireAfterSeconds": 60 * 60 * 24 * 180,
            }  # expires after 180 days
        ],
        "auto_create_index": True,  # auto-create indexes if missing
        "versionKey": False,  # prevent creating a __v field
    }

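The TTL index deserves a note: expireAfterSeconds is 60 * 60 * 24 * 180 = 15,552,000 seconds, so MongoDB deletes a message roughly 180 days after its createdAt. A usage sketch with invented field values:

```python
# Illustrative only; requires an active mongoengine connection (see connect_db).
from bson import ObjectId

from models.message import Message, MessageContent
from utils.time_utils import get_datetime_kst

now = get_datetime_kst()
msg = Message(
    type="news",       # invented example value
    status="SUCCESS",  # invented example value
    messages=[
        MessageContent(
            _id=ObjectId(),  # the embedded _id this PR adds
            name="example-site",
            title="Example headline",
            url="https://example.com/article",
        )
    ],
    createdAt=now,  # the TTL index deletes the document ~180 days from here
    updatedAt=now,
)
msg.save()
```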
25 changes: 11 additions & 14 deletions news-scraper-agent/models/site.py
@@ -1,30 +1,27 @@
+from typing import Optional
+
from mongoengine import Document, StringField, ListField, BooleanField, DateTimeField
from pydantic import BaseModel

-from typing import Optional
from utils.time_utils import get_datetime_kst


class Site(Document):
-    name = StringField(required=True)
-    url = StringField(required=True)
-    keywords = ListField(StringField(), required=False)
-    verified = BooleanField(required=True, default=False)
-    requestedBy = StringField(required=False)
+    name = StringField(required=True, db_field="name")
+    url = StringField(required=True, db_field="url")
+    keywords = ListField(StringField(), required=False, db_field="keywords")
+    verified = BooleanField(required=True, default=False, db_field="verified")
+    requestedBy = StringField(required=False, db_field="requestedBy")

    # Define createdAt and updatedAt fields directly instead of the timestamps option
-    createdAt = DateTimeField()
-    updatedAt = DateTimeField()
+    createdAt = DateTimeField(db_field="createdAt")
+    updatedAt = DateTimeField(db_field="updatedAt")

    meta = {
        "collection": "sites",
        "indexes": [
-            {"fields": ["name"]},
-            {"fields": ["url"]},
+            {"name": "name_1", "fields": ["name"]},
+            {"name": "url_1", "fields": ["url"]},
        ],
-        "auto_create_index": True,
-        "index_background": True,
+        "auto_create_index": True,  # auto-create indexes if missing
+        "versionKey": False,  # prevent creating a __v field
    }

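A small sketch of the queries the newly named indexes ("name_1", "url_1") would serve; values are invented:

```python
from models.site import Site

# Lookups by url hit the url_1 index; lookups by name hit name_1.
site = Site.objects(url="https://example.com").first()
verified = Site.objects(name="example-site", verified=True)
```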
9 changes: 8 additions & 1 deletion scraper-lambda/README.md
@@ -3,7 +3,14 @@
## required
- python 3.12.2
- poetry 2.0.1
--

+## How to run
```shell
+poetry env activate
+poetry install
+python {filename}.py
```

## Behavior
Receives url, content_type, and selector; from the data fetched at that url,
for html it extracts and returns either the full content or the data matching the selector.
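A sketch of the contract the Behavior section describes, assuming a plain Lambda event with url, content_type, and selector keys; the real handler's names, parser, and response shape are not shown in this diff:

```python
import urllib.request

from bs4 import BeautifulSoup  # assumed: some HTML parser fills this role


def handler(event: dict, context=None) -> dict:
    url = event["url"]
    content_type = event.get("content_type", "html")
    selector = event.get("selector")

    with urllib.request.urlopen(url, timeout=10) as resp:
        body = resp.read().decode("utf-8", errors="replace")

    if content_type == "html":
        soup = BeautifulSoup(body, "html.parser")
        if selector:
            # only the elements matching the CSS selector
            data = [el.get_text(strip=True) for el in soup.select(selector)]
        else:
            data = soup.get_text(strip=True)  # the full content
    else:
        data = body  # non-html payloads returned as-is (assumption)

    return {"statusCode": 200, "body": data}
```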
51 changes: 44 additions & 7 deletions template.yaml
@@ -46,16 +46,22 @@ Resources:
    DependsOn: NewsScraperAgentFunctionRole

  NewsScraperAgentSchedule:
-    Type: AWS::Events::Rule
+    Type: AWS::Scheduler::Schedule
    Condition: IsProd
    Properties:
      Name: !Sub news-scraper-agent-schedule-${PROFILE}
-      ScheduleExpression: 'cron(0 1 ? * 2-6 *)'
      Description: news-scraper-agent daily scheduler
+      FlexibleTimeWindow:
+        MaximumWindowInMinutes: 1
+        Mode: 'FLEXIBLE'
+      ScheduleExpression: 'cron(0 10 ? * 2-6 * )'
+      ScheduleExpressionTimezone: Asia/Seoul
      State: ENABLED
-      Targets:
-        - Id: NewsScraperAgentFunction
-          Arn: !GetAtt NewsScraperAgentFunction.Arn
+      Target:
+        Arn: !GetAtt NewsScraperAgentFunction.Arn
+        RoleArn: !GetAtt NewsScraperAgentScheduleRole.Arn

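One check on the schedule change: in AWS cron syntax, day-of-week 1 is Sunday, so 2-6 means Monday through Friday; and since KST is UTC+9, the old rule's 01:00 UTC equals the new expression's 10:00 Asia/Seoul. A quick verification sketch:

```python
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

# 01:00 UTC on a weekday (2025-02-07 is a Friday) ...
utc_fire = datetime(2025, 2, 7, 1, 0, tzinfo=timezone.utc)
# ... is 10:00 in Seoul, matching cron(0 10 ? * 2-6 *) with
# ScheduleExpressionTimezone: Asia/Seoul.
print(utc_fire.astimezone(ZoneInfo("Asia/Seoul")))  # 2025-02-07 10:00:00+09:00
```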
  ScraperLambdaFunction:
    Type: AWS::Serverless::Function # More info about Function Resource: https://github.com/awslabs/serverless-application-model/blob/master/versions/2016-10-31.md#awsserverlessfunction
@@ -93,7 +99,9 @@ Resources:
        Statement:
          - Effect: Allow
            Principal:
-              Service: lambda.amazonaws.com
+              Service:
+                - lambda.amazonaws.com
+                - scheduler.amazonaws.com
            Action: sts:AssumeRole
      Policies:
        - PolicyName: LambdaECRAccess
@@ -136,6 +144,34 @@
                - lambda:InvokeFunction
              Resource: !GetAtt ScraperLambdaFunction.Arn

+  NewsScraperAgentScheduleRole:
+    Type: AWS::IAM::Role
+    Properties:
+      RoleName: !Sub news-scraper-agent-schedule-role-${PROFILE}
+      Tags:
+        - Key: PROJECT
+          Value: AI_NEWS_AGENT
+        - Key: PHASE
+          Value: !Ref PROFILE
+      AssumeRolePolicyDocument:
+        Version: '2012-10-17'
+        Statement:
+          - Effect: Allow
+            Principal:
+              Service:
+                - lambda.amazonaws.com
+                - scheduler.amazonaws.com
+            Action: sts:AssumeRole
+      Policies:
+        - PolicyName: LambdaInvokePermission
+          PolicyDocument:
+            Version: '2012-10-17'
+            Statement:
+              - Effect: Allow
+                Action:
+                  - lambda:InvokeFunction
+                Resource: !GetAtt NewsScraperAgentFunction.Arn

  ScraperLambdaFunctionRole:
    Type: AWS::IAM::Role
    Properties:
@@ -150,7 +186,8 @@
        Statement:
          - Effect: Allow
            Principal:
-              Service: lambda.amazonaws.com
+              Service:
+                - lambda.amazonaws.com
            Action: sts:AssumeRole
      Policies:
        - PolicyName: LambdaECRAccess