diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bd9d467 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +models/**/*.safetensors filter=lfs diff=lfs merge=lfs -text +models/**/*.spiece.model filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d765f95 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# 가상환경 +.venv/ +venv/ +env/ +ENV/ + +# 로그 파일 +*.log + +# macOS 메타데이터 +.DS_Store + +# IDE 설정 파일 +.vscode/ +.idea/ \ No newline at end of file diff --git a/README.md b/README.md index 329d46a..4ef21e2 100644 --- a/README.md +++ b/README.md @@ -1 +1,61 @@ -# Inside movie AI \ No newline at end of file +# Inside movie AI + +## 1. 가상 환경 및 패키지 간단 설치 방법 +```bash +# 프로젝트 루트에 environment.yml이 있어야 합니다. +conda env create -n Insidemovie-AI -f environment.yml +``` + +**환경 활성화** +```bash +conda activate Insidemovie-AI +``` + +- - - +## 2. Conda 환경 설치 방법 +### Conda 가상 환경 생성 +```bash +conda create -n Insidemovie-AI python=3.12.11 +``` + +### 생성한 가상 환경 실행 +```bash +conda activate Insidemovie-AI +``` +## 3. venv 환경에서 설치 방법 +- `Insidemovie-AI` 이름으로 가상환경 생성 +```bash +python3 -m venv --prompt Insidemovie-AI .venv +``` + +- 환경 활성화 +```bash +source .venv/bin/activate +``` + + +## 4. 
패키지 요구사항 +- uvicorn == 0.35.0 +- fastapi == 0.116.1 +- pytorch == 2.7.1 +- transformers == 4.53.2 +- pydantic-settings +- sentencepiece == 0.2.0 +- motor + +#### 패키지 설치 코드 +```bash +pip install \ + uvicorn==0.35.0 \ + fastapi==0.116.1 \ + torch==2.7.1 \ + transformers==4.53.2 \ + pydantic-settings \ + sentencepiece==0.2.0 \ + motor +``` + +#### KoBERT 설치 +```bash +pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf' +``` \ No newline at end of file diff --git a/config.py b/config.py index b458586..9942a39 100644 --- a/config.py +++ b/config.py @@ -1,13 +1,13 @@ -# config.py +import os from pydantic_settings import BaseSettings, SettingsConfigDict class Settings(BaseSettings): - db_name: str = "emotion_db" mongodb_uri: str = "mongodb://localhost:27017" + db_name: str = "emotion_db" collection_name: str = "predictions" - title: str = "Inside Movie AI" + model_dir: str = os.getenv("MODEL_DIR", "models/0712_kobert_5_emotion_model") + title: str = "MovieMood - KoBERT Emotion API" - # .env 파일 자동 로딩 설정 model_config = SettingsConfigDict(env_file=".env") settings = Settings() \ No newline at end of file diff --git a/const.py b/const.py new file mode 100644 index 0000000..382229f --- /dev/null +++ b/const.py @@ -0,0 +1,8 @@ +# Emotion labels; the full 7-emotion set is joy, sadness, anger, surprise, disgust, fear, neutral. +# NOTE(review): LABELS_6 omits disgust and LABELS_5 omits disgust and surprise - confirm this is intended. +LABELS_6 = [ + "anger", "fear", "joy", "neutral", "sadness", "surprise" +] +LABELS_5 = [ + "anger", "fear", "joy", "neutral", "sadness" +] \ No newline at end of file diff --git a/db.py b/db.py index 3aa6058..835032e 100644 --- a/db.py +++ b/db.py @@ -1,4 +1,3 @@ -# app/db.py from motor.motor_asyncio import AsyncIOMotorClient from config import settings diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..49cf5d7 --- /dev/null +++ b/environment.yml @@ -0,0 +1,64 @@ +name: Insidemovie-AI +channels: + - defaults +dependencies: + - bzip2=1.0.8=h80987f9_6 + - ca-certificates=2025.2.25=hca03da5_0 + - expat=2.7.1=h313beb8_0 + - 
libcxx=17.0.6=he5c5206_4 + - libffi=3.4.4=hca03da5_1 + - ncurses=6.4=h313beb8_0 + - openssl=3.0.16=h02f6b3c_0 + - pip=25.1=pyhc872135_2 + - python=3.12.11=h421de30_0 + - readline=8.2=h1a28f6b_0 + - setuptools=78.1.1=py312hca03da5_0 + - sqlite=3.45.3=h80987f9_0 + - tk=8.6.14=h6ba3021_1 + - tzdata=2025b=h04d1e81_0 + - wheel=0.45.1=py312hca03da5_0 + - xz=5.6.4=h80987f9_1 + - zlib=1.2.13=h18a0788_1 + - pip: + - annotated-types==0.7.0 + - anyio==4.9.0 + - certifi==2025.7.9 + - charset-normalizer==3.4.2 + - click==8.2.1 + - dnspython==2.7.0 + - fastapi==0.116.1 + - filelock==3.18.0 + - fsspec==2025.5.1 + - h11==0.16.0 + - hf-xet==1.1.5 + - huggingface-hub==0.33.4 + - idna==3.10 + - jinja2==3.1.6 + - kobert-tokenizer==0.1 + - markupsafe==3.0.2 + - motor==3.7.1 + - mpmath==1.3.0 + - networkx==3.5 + - numpy==2.3.1 + - packaging==25.0 + - pydantic==2.11.7 + - pydantic-core==2.33.2 + - pydantic-settings==2.10.1 + - pymongo==4.13.2 + - python-dotenv==1.1.1 + - pyyaml==6.0.2 + - regex==2024.11.6 + - requests==2.32.4 + - safetensors==0.5.3 + - sentencepiece==0.2.0 + - sniffio==1.3.1 + - starlette==0.47.1 + - sympy==1.14.0 + - tokenizers==0.21.2 + - torch==2.7.1 + - tqdm==4.67.1 + - transformers==4.53.2 + - typing-extensions==4.14.1 + - typing-inspection==0.4.1 + - urllib3==2.5.0 + - uvicorn==0.35.0 diff --git a/main.py b/main.py index 9b47866..8ecb9eb 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,14 @@ -# main.py +import uvicorn from fastapi import FastAPI from config import settings -from db import collection from routers.home import router as home_router +from routers.predict import router as predict_router app = FastAPI(title=settings.title) + +# Register routers app.include_router(home_router) +app.include_router(predict_router) if __name__ == "__main__": - import uvicorn uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) \ No newline at end of file diff --git a/models/0712_kobert_5_emotion_model/config.json b/models/0712_kobert_5_emotion_model/config.json new file mode 
100644 index 0000000..f00d0eb --- /dev/null +++ b/models/0712_kobert_5_emotion_model/config.json @@ -0,0 +1,42 @@ +{ + "architectures": [ + "BertForSequenceClassification" + ], + "attention_probs_dropout_prob": 0.1, + "author": "Heewon Jeon(madjakarta@gmail.com)", + "classifier_dropout": null, + "gradient_checkpointing": false, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1", + "2": "LABEL_2", + "3": "LABEL_3", + "4": "LABEL_4" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "kobert_version": 1.0, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1, + "LABEL_2": 2, + "LABEL_3": 3, + "LABEL_4": 4 + }, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 1, + "position_embedding_type": "absolute", + "problem_type": "single_label_classification", + "torch_dtype": "float32", + "transformers_version": "4.53.1", + "type_vocab_size": 2, + "use_cache": true, + "vocab_size": 8002 +} diff --git a/models/0712_kobert_5_emotion_model/model.safetensors b/models/0712_kobert_5_emotion_model/model.safetensors new file mode 100644 index 0000000..8e09bb3 --- /dev/null +++ b/models/0712_kobert_5_emotion_model/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00255ad69ab36f05617b8ce490cae3c8a317894ddbd9308a02d40c11ddd77b5 +size 368786356 diff --git a/models/0712_kobert_5_emotion_model/special_tokens_map.json b/models/0712_kobert_5_emotion_model/special_tokens_map.json new file mode 100644 index 0000000..c5a4417 --- /dev/null +++ b/models/0712_kobert_5_emotion_model/special_tokens_map.json @@ -0,0 +1,15 @@ +{ + "bos_token": "[CLS]", + "cls_token": "[CLS]", + "eos_token": "[SEP]", + "mask_token": { + "content": "[MASK]", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": "[PAD]", + "sep_token": "[SEP]", 
+ "unk_token": "[UNK]" +} diff --git a/models/0712_kobert_5_emotion_model/spiece.model b/models/0712_kobert_5_emotion_model/spiece.model new file mode 100644 index 0000000..d00706c Binary files /dev/null and b/models/0712_kobert_5_emotion_model/spiece.model differ diff --git a/models/0712_kobert_5_emotion_model/tokenizer_config.json b/models/0712_kobert_5_emotion_model/tokenizer_config.json new file mode 100644 index 0000000..0329588 --- /dev/null +++ b/models/0712_kobert_5_emotion_model/tokenizer_config.json @@ -0,0 +1,64 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "[UNK]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "[PAD]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "[CLS]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "[SEP]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "[MASK]", + "lstrip": true, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [], + "bos_token": "[CLS]", + "clean_up_tokenization_spaces": false, + "cls_token": "[CLS]", + "do_lower_case": false, + "eos_token": "[SEP]", + "extra_special_tokens": {}, + "keep_accents": false, + "mask_token": "[MASK]", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "[PAD]", + "remove_space": true, + "sep_token": "[SEP]", + "sp_model_kwargs": { + "alpha": 0.6, + "enable_sampling": true, + "nbest_size": -1 + }, + "tokenizer_class": "KoBERTTokenizer", + "unk_token": "[UNK]" +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7103d63 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,42 @@ +annotated-types==0.7.0 +anyio==4.9.0 
+certifi==2025.7.14
+charset-normalizer==3.4.2
+click==8.2.1
+dnspython==2.7.0
+fastapi==0.116.1
+filelock==3.18.0
+fsspec==2025.5.1
+h11==0.16.0
+hf-xet==1.1.5
+huggingface-hub==0.33.4
+idna==3.10
+Jinja2==3.1.6
+kobert-tokenizer @ git+https://github.com/SKTBrain/KoBERT.git@fcd729f2f4b37858f333597c0782388ada51eb5f#subdirectory=kobert_hf
+MarkupSafe==3.0.2
+motor==3.7.1
+mpmath==1.3.0
+networkx==3.5
+numpy==2.3.1
+packaging==25.0
+pydantic==2.11.7
+pydantic-settings==2.10.1
+pydantic_core==2.33.2
+pymongo==4.13.2
+python-dotenv==1.1.1
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.4
+safetensors==0.5.3
+sentencepiece==0.2.0
+sniffio==1.3.1
+starlette==0.47.1
+sympy==1.14.0
+tokenizers==0.21.2
+torch==2.7.1
+tqdm==4.67.1
+transformers==4.53.2
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+urllib3==2.5.0
+uvicorn==0.35.0
diff --git a/routers/home.py b/routers/home.py
index 7f36a17..23f95c5 100644
--- a/routers/home.py
+++ b/routers/home.py
@@ -1,4 +1,3 @@
-# routers/home.py
 from fastapi import APIRouter
 from config import settings
 
diff --git a/routers/predict.py b/routers/predict.py
new file mode 100644
index 0000000..ab2c4dc
--- /dev/null
+++ b/routers/predict.py
@@ -0,0 +1,68 @@
+from fastapi import APIRouter, HTTPException
+from datetime import datetime, timezone
+from typing import Callable
+from schemas import TextItem, Prediction
+from services.prediction import (
+    predict_emotion_full,
+    predict_emotion_split_avg,
+    predict_emotion_overall_avg,
+)
+from db import collection
+
+router = APIRouter(prefix="/predict", tags=["Prediction"])
+
+
+async def _predict_and_store(
+    text: str, predictor: Callable[[str], dict[str, float]]
+) -> dict:
+    """Run ``predictor`` on ``text``, store the result and return it.
+
+    Shared by the three endpoints below so they cannot drift apart.
+
+    Args:
+        text: Raw input text from the request body.
+        predictor: One of the ``predict_emotion_*`` service functions.
+
+    Returns:
+        A dict with ``text``, ``probabilities`` and ``timestamp`` keys,
+        matching the ``Prediction`` response model.
+
+    Raises:
+        HTTPException: 500 carrying the error text when inference or the
+            database insert fails.
+    """
+    try:
+        # NOTE(review): predictor is synchronous and runs on the event loop
+        # thread here - confirm that blocking during inference is acceptable.
+        probs = predictor(text)
+        record = {
+            "text": text,
+            "probabilities": probs,
+            # utcnow() is deprecated since Python 3.12 and returns a naive
+            # value; store an explicit timezone-aware UTC timestamp instead.
+            "timestamp": datetime.now(timezone.utc),
+        }
+        # insert_one() adds an "_id" key to the dict it receives, so insert a
+        # copy to keep the ObjectId out of the response payload.
+        await collection.insert_one(dict(record))
+        return record
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+@router.post("/full", response_model=Prediction)
+async def predict_full(item: TextItem):
+    """Classify the whole text as one input and store the result."""
+    return await _predict_and_store(item.text, predict_emotion_full)
+
+
+@router.post("/split_avg", response_model=Prediction)
+async def predict_split_avg(item: TextItem):
+    """Classify each '.'-separated sentence and average the results."""
+    return await _predict_and_store(item.text, predict_emotion_split_avg)
+
+
+@router.post("/overall_avg", response_model=Prediction)
+async def predict_overall_avg(item: TextItem):
+    """Average the whole-text prediction with the per-sentence ones."""
+    return await _predict_and_store(item.text, predict_emotion_overall_avg)
diff --git a/routers/review.py b/routers/review.py
new file mode 100644
index 0000000..9898d64
--- /dev/null
+++ b/routers/review.py
@@ -0,0 +1,130 @@
+# TODO: persist the crawled reviews (MongoDB or MySQL)
+from fastapi import APIRouter, HTTPException
+from fastapi.concurrency import run_in_threadpool
+from pydantic import BaseModel
+
+import requests
+
+# NOTE(review): main.py in this change only includes the home and predict
+# routers, so this router does not look registered yet - confirm.
+router = APIRouter(
+    prefix = "/review",
+    tags = ['review']
+)
+
+# Seconds to wait for the upstream gateway before giving up.
+_REQUEST_TIMEOUT = 10
+
+
+class review_request(BaseModel):
+    """Request body for ``POST /review/crawl``."""
+
+    movie_id: int  # sent upstream as the GraphQL ``contentId``
+    review_count: int = 100  # sent upstream as ``reviewsLimit``
+
+
+def _failure(message: str, status: int = 500) -> dict:
+    """Build the failure payload shape used by ``crawl_data``."""
+    return {
+        "status": status,
+        "success": "false",
+        "message": message,
+    }
+
+
+@router.post("/crawl")
+async def crawl_data(request: review_request):
+    """Fetch reviews for one title from the Kinolights GraphQL gateway.
+
+    Args:
+        request: Content id and the number of reviews to request.
+
+    Returns:
+        A dict with ``status``, ``success`` ("true"/"false") and ``message``.
+        Failures are reported in this payload rather than raised.
+    """
+    url = "https://gateway.kinolights.com/graphql"
+
+    headers = {
+        "User-Agent": "Mozilla/5.0",
+        "Content-Type": "application/json",
+    }
+
+    payload = {
+        "operationName": "QueryContentReviews",
+        "variables": {
+            "contentId": request.movie_id,  # movie ID
+            "reviewsOffset": 0,
+            "reviewsLimit": request.review_count,
+            "reviewsOrderBy": "LIKE",
+            "reviewsOrderOption": "DESC",
+        },
+        "query": """
+        query QueryContentReviews(
+            $contentId: Int!,
+            $reviewsOffset: Int = 0,
+            $reviewsLimit: Int = 10,
+            $reviewsOrderBy: ReviewMoviesOrderType!,
+            $reviewsOrderOption: OrderOptionType!,
+            $reviewType: ReviewFilterType,
+        ) {
+            reviews(
+                movieId: $contentId
+                offset: $reviewsOffset
+                limit: $reviewsLimit
+                orderBy: $reviewsOrderBy
+                orderOption: $reviewsOrderOption
+                reviewType: $reviewType
+            ) {
+                reviewTitle
+                review
+
+            }
+        }
+        """,
+    }
+    try:
+        # requests is blocking, so run it in the threadpool to keep the
+        # event loop free, and bound the wait with a timeout.
+        response = await run_in_threadpool(
+            requests.post,
+            url,
+            headers=headers,
+            json=payload,
+            timeout=_REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()  # raise on HTTP error status codes
+
+        body = response.json()
+
+        # GraphQL reports failures in a top-level "errors" list, not inside
+        # data.reviews, so inspect the envelope before reading the data.
+        data = body.get("data") if isinstance(body, dict) else None
+        reviews = data.get("reviews") if isinstance(data, dict) else None
+        if reviews is None or body.get("errors"):
+            return _failure(
+                f"리뷰 요청에 실패했습니다. {response.text}",
+                status=response.status_code,
+            )
+
+        # NOTE(review): nothing is stored yet (see the TODO at the top of
+        # the file) even though the message below says the data was saved.
+        return {
+            "status": 200,
+            "success": "true",
+            "message": "리뷰 데이터가 저장되었습니다.",
+        }
+
+    except requests.exceptions.JSONDecodeError:
+        # Must precede RequestException: requests' JSONDecodeError is a
+        # subclass of it, so parse failures never reached the ValueError branch.
+        return _failure("서버 응답을 JSON으로 파싱할 수 없습니다.")
+
+    except requests.exceptions.RequestException as e:
+        return _failure(f"요청 실패: {str(e)}")
+
+    except ValueError:
+        return _failure("서버 응답을 JSON으로 파싱할 수 없습니다.")
+
+    except Exception as e:
+        return _failure(f"알 수 없는 오류 발생: {str(e)}")
diff --git a/schemas.py b/schemas.py
new file mode 100644
index 0000000..e94eb7b
--- /dev/null
+++ b/schemas.py
@@ -0,0 +1,18 @@
+from datetime import datetime
+from typing import Dict, Optional
+
+from pydantic import BaseModel
+
+
+class TextItem(BaseModel):
+    """Request body carrying the text to classify."""
+
+    text: str
+
+
+class Prediction(BaseModel):
+    """Response model for the prediction endpoints."""
+
+    text: str
+    probabilities: Dict[str, float]  # emotion label -> percentage (0-100)
+    timestamp: Optional[datetime] = None
diff --git a/services/prediction.py b/services/prediction.py
new file mode 100644
index 0000000..df1baa3
--- /dev/null
+++ b/services/prediction.py
@@ -0,0 +1,101 @@
+import torch
+from transformers import AutoModelForSequenceClassification
+from kobert_tokenizer import KoBERTTokenizer
+from const import LABELS_5
+from config import settings
+
+# Tokenizer and model are loaded once at import time and shared by all calls.
+# NOTE(review): the bundled tokenizer_config.json sets
+# sp_model_kwargs.enable_sampling=true, which presumably makes tokenization
+# (and so the scores) vary between calls - confirm this is wanted at inference.
+_tokenizer = KoBERTTokenizer.from_pretrained(settings.model_dir)
+_model = AutoModelForSequenceClassification.from_pretrained(settings.model_dir)
+# NOTE(review): config.json only carries generic LABEL_0..LABEL_4 names, so
+# mapping logit index i to LABELS_5[i] assumes the training label order
+# matched LABELS_5 - verify against the training code.
+_model.eval()
+
+
+def get_avg_emotion(results: list[dict[str, float]]) -> dict[str, float]:
+    """Return the per-label mean of several probability dicts.
+
+    Args:
+        results: One ``{label: probability}`` dict per classified text.
+
+    Returns:
+        One entry per label in ``LABELS_5``; all zeros when ``results`` is
+        empty.
+    """
+    if not results:
+        return {label: 0.0 for label in LABELS_5}
+
+    n = len(results)
+    # Drive the loop from LABELS_5 instead of each dict's own keys: an
+    # unexpected label is ignored rather than raising KeyError, and a
+    # missing label counts as 0.
+    return {
+        label: sum(prob.get(label, 0.0) for prob in results) / n
+        for label in LABELS_5
+    }
+
+
+def _predict_batch(texts: list[str]) -> list[dict[str, float]]:
+    """Classify ``texts`` in one forward pass.
+
+    Args:
+        texts: Input strings; each is truncated to 128 tokens.
+
+    Returns:
+        One ``{label: probability}`` dict per input, in input order, with
+        softmax probabilities in the range 0-1.
+    """
+    if not texts:
+        # Nothing to classify; do not hand the tokenizer an empty batch.
+        return []
+
+    inputs = _tokenizer(
+        texts,
+        return_tensors="pt",
+        truncation=True,
+        padding=True,
+        max_length=128,
+    )
+
+    with torch.no_grad():
+        logits = _model(**inputs).logits  # [batch_size, num_labels]
+        probs = torch.softmax(logits, dim=-1)
+
+    return [dict(zip(LABELS_5, row)) for row in probs.tolist()]
+
+
+def _split_sentences(text: str) -> list[str]:
+    """Split ``text`` on '.' into stripped, non-empty sentences.
+
+    Falls back to the whole stripped text when no sentence is found.
+    """
+    sentences = [s.strip() for s in text.split('.') if s.strip()]
+    return sentences or [text.strip()]
+
+
+def _format_percent(probs: dict[str, float]) -> dict[str, float]:
+    """Convert 0-1 probabilities to percentages rounded to 2 decimals."""
+    return {label: round(probs[label] * 100, 2) for label in LABELS_5}
+
+
+def predict_emotion_split_avg(text: str) -> dict[str, float]:
+    """Average the predictions of the individual sentences of ``text``."""
+    raw_probs = _predict_batch(_split_sentences(text))
+    return _format_percent(get_avg_emotion(raw_probs))
+
+
+def predict_emotion_overall_avg(text: str) -> dict[str, float]:
+    """Average the whole-text prediction with the per-sentence ones."""
+    all_texts = [text.strip()] + _split_sentences(text)
+    raw_probs = _predict_batch(all_texts)
+    return _format_percent(get_avg_emotion(raw_probs))
+
+
+def predict_emotion_full(text: str) -> dict[str, float]:
+    """Predict on the whole stripped text as a single input."""
+    [raw] = _predict_batch([text.strip()])
+    return _format_percent(raw)