# -*- coding: utf-8 -*-
import os, re, logging, random, base64, json, urllib.parse
from datetime import datetime
from typing import List, Dict, Tuple
from bs4 import BeautifulSoup as BS
from jinja2 import Environment, FileSystemLoader
# Jinja2 environment rooted at the package-local "templates" directory.
TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
env = Environment(loader=FileSystemLoader(TEMPLATE_DIR))

# Module logger; level is configurable via the LOG_LEVEL env var (default INFO).
logger = logging.getLogger("generators")
logger.setLevel(os.getenv("LOG_LEVEL", "INFO"))
def _textify(html_or_text: str) -> str:
if not html_or_text:
return “”
try:
return BS(html_or_text, “html.parser”).get_text(” “, strip=True)
except Exception:
return str(html_or_text)
def _first_img_src_from_html(html: str) -> str | None:
if not html:
return None
try:
soup = BS(html, “html.parser”)
im = soup.find(“img”)
if not im:
return None
for k in (“src”, “data-src”, “data-lazy”, “data-original”):
v = im.get(k)
if v and isinstance(v, str) and v.startswith((“http://”, “https://”)):
return v
except Exception:
pass
return None
def _cloud_fetch(u: str) -> str:
“””Use Cloudinary fetch proxy if configured, to make remote images robust.”””
cloud = os.getenv(“CLOUDINARY_CLOUD_NAME”, “”).strip()
if not (cloud and u and u.startswith((“http://”, “https://”))):
return u
enc = urllib.parse.quote(u, safe=””)
# auto format/quality, strip metadata
return f”https://res.cloudinary.com/{cloud}/image/fetch/f_auto,q_auto,fl_force_strip/{enc}”
def summarize_text(text: str, sentences: int = 8) -> str:
    """Summarize *text*, preferring an LLM and falling back to sentence slicing.

    Returns "" for empty input. The fallback keeps the first *sentences*
    sentences longer than 25 characters, capped at 1200 characters.
    """
    text = (_textify(text) or "").strip()
    if not text:
        return ""
    # Try LLM if available. NOTE(review): this is the legacy pre-1.0 openai
    # SDK interface (openai.ChatCompletion); failure is tolerated below.
    if os.getenv("OPENAI_API_KEY"):
        try:
            import openai  # type: ignore
            openai.api_key = os.getenv("OPENAI_API_KEY")
            resp = openai.ChatCompletion.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "한국어 친화 요약가. 6~10문장 핵심 요약."},
                    {"role": "user", "content": text[:6000]},
                ],
                max_tokens=600,
                temperature=0.3,
            )
            out = resp["choices"][0]["message"]["content"].strip()
            if out:
                return out
        except Exception as e:
            logger.info(f"[SUMM] LLM fail: {e}")
    # Fallback: naive split on sentence-terminal punctuation.
    parts = re.split(r'(?<=[.!?])\s+', text)
    parts = [p.strip() for p in parts if len(p.strip()) > 25]
    if not parts:
        return text[:1200]
    return " ".join(parts[:sentences])[:1200]
def translate_ko(text: str) -> str:
    """Translate *text* to Korean with layered fallbacks.

    Order of attempts: Google Cloud Translate (service account decoded from
    GCP_CREDENTIALS_JSON_B64, else default credentials), then Papago, then a
    guaranteed plain-text fallback so callers always get a non-empty string
    for non-empty input.
    """
    text = (text or "").strip()
    if not text:
        return ""
    # 1) Google Cloud (service account via base64-encoded JSON env var)
    try:
        from google.cloud import translate_v2 as translate  # type: ignore
        from google.oauth2 import service_account  # type: ignore
        b64 = os.getenv("GCP_CREDENTIALS_JSON_B64", "").strip()
        if b64:
            data = json.loads(base64.b64decode(b64))
            creds = service_account.Credentials.from_service_account_info(data)
            client = translate.Client(credentials=creds)
        else:
            client = translate.Client()
        res = client.translate(text, target_language="ko")
        ko = res.get("translatedText") or ""
        if ko:
            return ko
    except Exception as e:
        logger.info(f"[TRANS] GCP fail: {e}")
    # 2) Papago fallback (only when both credentials are configured)
    try:
        pid, psec = os.getenv("PAPAGO_CLIENT_ID"), os.getenv("PAPAGO_CLIENT_SECRET")
        if pid and psec:
            import requests  # lazy import
            r = requests.post(
                "https://openapi.naver.com/v1/papago/n2mt",
                headers={"X-Naver-Client-Id": pid, "X-Naver-Client-Secret": psec},
                data={"source": "en", "target": "ko", "text": text},
                timeout=12,
            )
            j = r.json()
            ko = (j.get("message", {}) or {}).get("result", {}).get("translatedText", "")
            if ko:
                return ko
    except Exception as e:
        logger.info(f"[TRANS] Papago fail: {e}")
    # 3) Guaranteed fallback: tag the (truncated) source text in Korean.
    base = text[:720]
    return f"자동 한글 요약: {base}…" if base else "자동 한글 요약: (원문이 충분하지 않습니다.)"
NEWS_DOMAINS = (“arstechnica”, “theverge”, “wired”, “techcrunch”, “tomshardware”, “coindesk”, “cointelegraph”)
PRODUCT_HINTS = (“amazon.”, “coupang”, “store”, “shop”, “buy.”, “cart”, “smartstore”, “bestbuy”, “newegg”, “bhphotovideo”)
PRODUCT_TITLE_KEYWORDS = (“phone”, “iphone”, “galaxy”, “pixel”, “samsung”, “xiaomi”, “oneplus”,
“tv”, “headphone”, “earbuds”, “charger”, “ssd”, “nvme”, “camera”, “dslr”,
“mirrorless”, “lens”, “monitor”, “webcam”, “streamer”, “set-top”)
def classify_link(url: str | None) -> str:
u = (url or “”).lower()
if not u:
return “출처”
if any(h in u for h in PRODUCT_HINTS):
return “구매하기”
if any(d in u for d in NEWS_DOMAINS) or (“news” in u):
return “기사보기”
return “출처”
def build_cards(crawled: List[Dict], limit: int = 6) -> Tuple[List[Dict], Dict[str, int]]:
    """Build up to *limit* card dicts from crawled items.

    Priority: (1) items whose title matches product keywords, (2) remaining
    items labeled by URL heuristics (deduped by URL), (3) keyword-derived
    placeholder cards to pad out to *limit*.

    Returns (cards, stats) where stats counts cards by kind
    ("product" / "article" / "source").
    """
    cards = []
    stats = {"product": 0, "article": 0, "source": 0}
    # 1) product-first by title keywords
    for it in crawled:
        if len(cards) >= limit:
            break  # keep stats consistent with the cards actually emitted
        title_lc = (it.get("product") or it.get("title") or it.get("og_title") or "").lower()
        url = it.get("url")
        img = it.get("image") or it.get("og_image") or ""
        if any(kw in title_lc for kw in PRODUCT_TITLE_KEYWORDS):
            stats["product"] += 1
            review_src = _textify(it.get("text") or it.get("html") or "")[:90]
            cards.append({
                "title": it.get("title") or it.get("product") or "추천 제품",
                "image": _cloud_fetch(img) if img else "https://via.placeholder.com/220x160?text=No+Image",
                "url": url,
                "btn_text": "구매하기",
                "review": review_src,
            })
    # 2) remaining items: article / source, skipping URLs already carded
    for it in crawled:
        if len(cards) >= limit:
            break
        u = it.get("url") or ""
        if u and any(c.get("url") == u for c in cards):
            continue
        title = it.get("title") or "추천"
        url = it.get("url")
        img = it.get("image") or it.get("og_image") or ""
        btn = classify_link(url)
        if btn == "구매하기":
            stats["product"] += 1
        elif btn == "기사보기":
            stats["article"] += 1
        else:
            stats["source"] += 1
        review_src = _textify(it.get("text") or it.get("html") or "")[:90]
        cards.append({
            "title": title,
            "image": _cloud_fetch(img) if img else "https://via.placeholder.com/220x160?text=No+Image",
            "url": url, "btn_text": btn, "review": review_src,
        })
    # 3) shortage → placeholder cards from the most common title/text words
    if len(cards) < limit:
        words = []
        for it in crawled:
            words += re.findall(r"[A-Za-z0-9]{3,}", (it.get("title") or "") + " " + (it.get("text") or ""))
        from collections import Counter
        kwc = [k for k, _ in Counter([w.lower() for w in words]).most_common(10)] or ["tech"]
        while len(cards) < limit:
            kw = kwc[len(cards) % len(kwc)]
            cards.append({
                "title": f"{kw.title()} 추천",
                "image": f"https://via.placeholder.com/220x160?text={urllib.parse.quote(kw)}",
                "url": None, "btn_text": "기사보기", "review": "",
            })
            stats["article"] += 1
    return cards[:limit], stats
def build_article_html(crawled: List[Dict], min_plain_chars: int = 1200) -> Tuple[str, int]:
    """Assemble the article body HTML from crawled items.

    Concatenates one section per item (title + plain text) until at least
    *min_plain_chars* of plain text is accumulated; if still short, appends a
    related-links list built from the first items' titles.

    Returns (html, plain_text_length).

    NOTE(review): the original section markup was stripped in transit (the
    f-string bodies survived with no tags). The <h2>/<p class="jump">
    structure below is a reconstruction guided by the surviving "p.jump"
    comment — confirm against the rendering template before shipping.
    """
    parts = []
    total_plain = 0
    for idx, it in enumerate(crawled):
        title = it.get("title") or f"섹션 {idx+1}"
        txt = _textify(it.get("text") or it.get("html") or "")
        if not txt:
            continue
        # p.jump → cursor/click-handling hook consumed by the template CSS/JS
        chunk = (
            f'<h2>{title}</h2>\n'
            f'<p class="jump">{txt}</p>'
        )
        parts.append(chunk)
        total_plain += len(txt)
        if total_plain >= min_plain_chars:
            break
    body = "\n".join(parts)
    if total_plain < min_plain_chars:
        # Pad short articles with a "related links" title list.
        items = "".join(
            f"<li>{_textify(it.get('title') or '')}</li>" for it in crawled[:6]
        )
        extra = f"<h3>관련 링크</h3>\n<ul>{items}</ul>"
        body = body + "\n" + extra
        total_plain = len(_textify(body))
    return body, total_plain
def select_hero(crawled: List[Dict]) -> Tuple[str, str]:
    """Pick a hero image URL from crawled items.

    Preference order: explicit og_image/image fields, then the first <img>
    found inside an item's HTML, then a placeholder. Returns (url, reason)
    where *reason* records which source was used (for diagnostics/meta).
    """
    for it in crawled:
        for k in ("og_image", "image"):
            img = it.get(k)
            if img and isinstance(img, str) and img.startswith(("http://", "https://")):
                return _cloud_fetch(img), k
        first_img = _first_img_src_from_html(it.get("html") or "")
        if first_img:
            return _cloud_fetch(first_img), "first_img_in_html"
    return "https://via.placeholder.com/800x400?text=No+Image", "placeholder"
def build_post(category: str, cfg: dict):
    """Crawl *category* sources and render a complete post via base.html.

    Pipeline: crawl → summarize (EN) → translate (KR, with guaranteed
    fallback) → build body sections, hero image and product cards → render.

    Returns a dict with the rendered html plus title, hero, cards, summaries,
    post_url, and a diagnostic ``meta`` dict.
    """
    cat = (category or "auto").strip().lower()
    logger.info(f"[BUILD_POST] category={cat}")
    # Crawl — a missing crawler module degrades to an empty result set.
    try:
        from crawler.generic_crawler import crawl_sources as _crawl  # type: ignore
    except Exception:
        def _crawl(x): return []
    crawled = _crawl(cat) or []
    logger.info(f"[BUILD_POST] crawled count={len(crawled)}")
    main_title = crawled[0].get("title") if crawled else f"{cat.capitalize()}"
    all_texts = [_textify(it.get("text") or it.get("html") or "") for it in crawled]
    all_text = "\n".join([t for t in all_texts if t]).strip()
    summary_en = summarize_text(all_text, sentences=8) if all_text else ""
    summary_kr = translate_ko(summary_en) if summary_en else ""
    if not summary_kr:
        # Last-resort Korean summary line so the template never renders empty.
        base = summary_en[:720] if summary_en else ""
        summary_kr = f"자동 한글 요약: {base}…" if base else "자동 한글 요약: (원문이 충분하지 않습니다.)"
    article_html, sections_plain_len = build_article_html(crawled, min_plain_chars=1200)
    hero_url, hero_reason = select_hero(crawled)
    cards, stats = build_cards(crawled, limit=6)
    post_url = (cfg or {}).get("post_url")
    # Hero click-through is opt-out via WP_HERO_CLICK=false.
    hero_click = os.getenv("WP_HERO_CLICK", "true").lower() == "true"
    html_out = env.get_template("base.html").render(
        title=main_title,
        category=cat,
        date_str=datetime.now().strftime("%Y-%m-%d"),
        hero_image_url=hero_url,
        hero_click=hero_click,
        summary_en=summary_en,
        summary_kr=summary_kr,
        article_html=article_html,
        product_cards=cards,
        post_url=post_url,
    )
    meta = {
        "SUMMARY_EN_LEN": len(summary_en or ""),
        "SUMMARY_KR_LEN": len(summary_kr or ""),
        "HERO": hero_url,
        "HERO_SRC": hero_reason,
        "SECTIONS_TOTAL_LEN": sections_plain_len,
        "CARDS_STATS": stats,
    }
    logger.info(f"[META] {meta}")
    return {
        "title": main_title,
        "html": html_out,
        "hero_image_url": hero_url,
        "product_cards": cards,
        "summary_en": summary_en,
        "summary_kr": summary_kr,
        "post_url": post_url,
        "meta": meta,
    }
# NOTE: removed stray "Leave a Reply" text — a web-page footer artifact from the copy/paste, not part of the source.