From b1ffd81fec924ef0b9368f085357c6d17a0e32b5 Mon Sep 17 00:00:00 2001 From: mrinal Date: Fri, 5 Jun 2026 22:55:18 +0530 Subject: [PATCH 1/9] feat: add free-LLM sentiment tier above FinBERT/VADER Vendor free_llm_router and layer an LLM sentiment tier on top of the existing FinBERT -> XLM-RoBERTa -> VADER chain in processors/sentiment.py. The LLM returns financial sentiment with direction (bullish/bearish), which keyword/FinBERT scoring conflates with tone. Falls through to the existing chain on any failure, so reliability is unchanged. - Sync asyncio.run bridge (Celery workers have no running loop). - Enabled by default when the package is present; FREE_LLM_ENABLED=false to disable. - .env.example documents per-provider keys. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 9 ++ free_llm_router/__init__.py | 30 +++++ free_llm_router/health.py | 85 ++++++++++++ free_llm_router/policy.py | 65 +++++++++ free_llm_router/providers.py | 111 ++++++++++++++++ free_llm_router/ratelimit.py | 64 +++++++++ free_llm_router/router.py | 247 +++++++++++++++++++++++++++++++++++ processors/sentiment.py | 88 +++++++++++++ 8 files changed, 699 insertions(+) create mode 100644 free_llm_router/__init__.py create mode 100644 free_llm_router/health.py create mode 100644 free_llm_router/policy.py create mode 100644 free_llm_router/providers.py create mode 100644 free_llm_router/ratelimit.py create mode 100644 free_llm_router/router.py diff --git a/.env.example b/.env.example index 39d3df0..6da3656 100644 --- a/.env.example +++ b/.env.example @@ -32,6 +32,15 @@ TOR_PROXY=socks5://127.0.0.1:9050 ANTHROPIC_API_KEY= OLLAMA_URL=http://localhost:11434 +# Free LLM router — adds an LLM sentiment tier above FinBERT/VADER. +# Set FREE_LLM_ENABLED=false to disable. Router uses only providers with a key set. +FREE_LLM_ENABLED=true +GROQ_API_KEY= +CEREBRAS_API_KEY= +GOOGLE_AI_STUDIO_API_KEY= +MISTRAL_API_KEY= +OPENROUTER_API_KEY= + # ── Destination: DragonScope ────────────────────────────── DRAGONSCOPE_REDIS_URL=redis://localhost:6379/1 DRAGONSCOPE_API_URL=http://localhost:3456 diff --git a/free_llm_router/__init__.py b/free_llm_router/__init__.py new file mode 100644 index 0000000..e2c98c4 --- /dev/null +++ b/free_llm_router/__init__.py @@ -0,0 +1,30 @@ +"""free-llm-router: failover across perpetually-free, OpenAI-compatible LLM APIs.""" + +from .health import CircuitBreaker, State +from .providers import Provider, REGISTRY, available_providers +from .ratelimit import TokenBucket +from .router import ( + AllProvidersFailed, + FreeLLMRouter, + OrderFn, + ProviderStats, + TASK_TIER, + default_order, +) + +__all__ = [ + "FreeLLMRouter", + "Provider", + "ProviderStats", + "OrderFn", + "default_order", + "AllProvidersFailed", + "TASK_TIER", + "REGISTRY", + "available_providers", + "TokenBucket", + "CircuitBreaker", + "State", +] + +__version__ = "0.1.0" diff --git a/free_llm_router/health.py b/free_llm_router/health.py new file mode 100644 index 0000000..00ae4da --- /dev/null +++ b/free_llm_router/health.py @@ -0,0 +1,85 @@ +""" +Per-provider circuit breaker. + +A free provider that starts 500-ing or timing out should be taken out of the +rotation quickly and probed cautiously — otherwise every request pays the full +timeout before failing over, and a flapping provider gets hammered. + +States: + closed – normal; requests flow. + open – too many recent failures; reject fast until cooldown elapses. + half_open – cooldown elapsed; allow EXACTLY ONE probe. If it succeeds we + close; if it fails we re-open. Letting many probes through at + once was a real bug — they all rush the still-broken provider + and the breaker oscillates. +""" + +from __future__ import annotations + +import asyncio +from enum import Enum + + +class State(str, Enum): + CLOSED = "closed" + OPEN = "open" + HALF_OPEN = "half_open" + + +class CircuitBreaker: + def __init__( + self, + *, + monotonic, + failure_threshold: int = 3, + cooldown_sec: float = 30.0, + ) -> None: + self._monotonic = monotonic + self._failure_threshold = failure_threshold + self._cooldown_sec = cooldown_sec + self._state = State.CLOSED + self._failures = 0 + self._opened_at = 0.0 + self._probe_in_flight = False + self._lock = asyncio.Lock() + + async def allow(self) -> bool: + """Whether a request may proceed right now.""" + async with self._lock: + if self._state is State.CLOSED: + return True + if self._state is State.OPEN: + if self._monotonic() - self._opened_at >= self._cooldown_sec: + # cooldown elapsed → permit a single probe + self._state = State.HALF_OPEN + self._probe_in_flight = True + return True + return False + # HALF_OPEN: only the one in-flight probe is allowed + if not self._probe_in_flight: + self._probe_in_flight = True + return True + return False + + async def record_success(self) -> None: + async with self._lock: + self._failures = 0 + self._probe_in_flight = False + self._state = State.CLOSED + + async def record_failure(self) -> None: + async with self._lock: + self._probe_in_flight = False + if self._state is State.HALF_OPEN: + # probe failed → straight back to open, restart cooldown + self._state = State.OPEN + self._opened_at = self._monotonic() + return + self._failures += 1 + if self._failures >= self._failure_threshold: + self._state = State.OPEN + self._opened_at = self._monotonic() + + @property + def state(self) -> State: + return self._state diff --git a/free_llm_router/policy.py b/free_llm_router/policy.py new file mode 100644 index 0000000..b84b165 --- /dev/null +++ b/free_llm_router/policy.py @@ -0,0 +1,65 @@ +""" +Provider ordering policy — YOUR decision point. + +The router calls an ``OrderFn`` before every request to decide which provider to +try first, second, third… Given a live snapshot of each provider's state, return +the providers in the order you want them attempted. + +The default policy (``free_llm_router.router.default_order``) sorts by static +``priority`` only. That's fine until reality intrudes: + * The top-priority provider is rate-limited *this minute* — trying it first just + wastes a failover hop (it'll be skipped, but it's still first in line). + * A provider has burned 49/50 of its daily quota — maybe save it for last. + * One provider has been consistently slow (high ``last_latency_ms``). + * A provider's circuit is half_open — risky; maybe deprioritize. + +`ProviderStats` gives you, per provider: + .provider.priority static rank (lower = preferred) + .circuit_state "closed" | "open" | "half_open" + .tokens_available bool — has an RPM token to spend right now + .day_count / .day_limit requests spent today / documented daily cap (cap may be None) + .last_latency_ms most recent successful round-trip, 0.0 if never called + +Tradeoffs to weigh: + - Latency-first ordering gets fast answers but can stampede one provider until + it rate-limits, then thrash. + - Quota-preserving ordering (spread load, save scarce daily quotas for last) + is gentler on the free tiers — which is the whole point of not getting banned. + - Health-first ordering avoids dead providers but a pure "closed-circuits-first" + sort ignores speed and quota entirely. + +There is no single right answer — it depends on whether you optimize for speed, +for staying under the free caps, or for resilience. That's why it's yours. +""" + +from __future__ import annotations + +from typing import List + +from .router import ProviderStats, default_order +from .providers import Provider + + +def smart_order(stats: List[ProviderStats]) -> List[Provider]: + """ + TODO(you): Rank providers for the next request. + + Return a list[Provider] in the order they should be tried. You don't have to + include every provider, but anything you drop simply won't be attempted this + call (the router still skips rate-limited / open-circuit ones defensively, so + dropping them is optional). + + Suggested shape — sort by a tuple of keys, cheapest-to-violate first, e.g.: + + def rank(s: ProviderStats): + return ( + 0 if s.circuit_state == "closed" else 1, # healthy first + 0 if s.tokens_available else 1, # ready-now first + ???, # your quota / latency call + s.provider.priority, # static tie-break + ) + return [s.provider for s in sorted(stats, key=rank)] + + Replace the line below with your implementation. + """ + return default_order(stats) # placeholder — delegates to static priority diff --git a/free_llm_router/providers.py b/free_llm_router/providers.py new file mode 100644 index 0000000..d749686 --- /dev/null +++ b/free_llm_router/providers.py @@ -0,0 +1,111 @@ +""" +Registry of perpetually-free, OpenAI-compatible LLM providers. + +Every provider here exposes a ``POST {base_url}/chat/completions`` endpoint that +accepts the OpenAI request schema. That uniformity is what lets a single client +body talk to all of them — only ``base_url``, the API key, and the model id change. + +We model two logical *tiers* instead of hard-coding model names at call sites: + + "fast" – small, low-latency model for classification / bulk work + "smart" – larger model for drafting / reasoning / summarization + +Each provider maps the tiers to a concrete model it offers for free. Callers ask +for a tier; the router resolves it per-provider during failover. +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +Tier = str # "fast" | "smart" + + +@dataclass(frozen=True) +class Provider: + """A single free LLM provider and its free-tier characteristics.""" + + name: str + base_url: str + api_key_env: str # env var holding the key + models: Dict[Tier, str] # tier -> concrete model id offered for free + rpm: int # documented free-tier requests per minute + rpd: Optional[int] # documented free-tier requests per day (None = unknown) + priority: int # tie-breaker; lower = generally preferred + referer: str = "" # OpenRouter wants HTTP-Referer/X-Title for free tier + extra_headers: Dict[str, str] = field(default_factory=dict) + + @property + def api_key(self) -> Optional[str]: + return os.environ.get(self.api_key_env) or None + + def model_for(self, tier: Tier) -> Optional[str]: + return self.models.get(tier) + + +# ── The registry (perpetually-free tiers only — no trial-credit providers) ────── +# +# Limits are the documented free-tier numbers at time of writing; they drift, so +# treat them as hints for the rate limiter rather than guarantees. Sources: +# github.com/cheahjs/free-llm-api-resources + +REGISTRY: List[Provider] = [ + Provider( + name="groq", + base_url="https://api.groq.com/openai/v1", + api_key_env="GROQ_API_KEY", + models={"fast": "llama-3.1-8b-instant", "smart": "llama-3.3-70b-versatile"}, + rpm=30, + rpd=14_400, + priority=10, # fastest inference of the free tiers + ), + Provider( + name="cerebras", + base_url="https://api.cerebras.ai/v1", + api_key_env="CEREBRAS_API_KEY", + models={"fast": "llama3.1-8b", "smart": "llama-3.3-70b"}, + rpm=30, + rpd=14_400, + priority=20, + ), + Provider( + name="google_ai_studio", + # Google exposes an OpenAI-compatible shim under /v1beta/openai + base_url="https://generativelanguage.googleapis.com/v1beta/openai", + api_key_env="GOOGLE_AI_STUDIO_API_KEY", + models={"fast": "gemini-2.0-flash-lite", "smart": "gemini-2.0-flash"}, + rpm=15, + rpd=1_500, + priority=30, # generous token quota, strong quality + ), + Provider( + name="mistral", + base_url="https://api.mistral.ai/v1", + api_key_env="MISTRAL_API_KEY", + models={"fast": "open-mistral-nemo", "smart": "mistral-small-latest"}, + rpm=60, + rpd=None, + priority=40, + ), + Provider( + name="openrouter", + base_url="https://openrouter.ai/api/v1", + api_key_env="OPENROUTER_API_KEY", + # ":free" suffixed models cost nothing on OpenRouter + models={ + "fast": "meta-llama/llama-3.3-70b-instruct:free", + "smart": "deepseek/deepseek-r1:free", + }, + rpm=20, + rpd=50, # 1000/day if the account has ever topped up $10 + priority=50, # widest model catalog, but tightest free request cap + referer="https://github.com/cheahjs/free-llm-api-resources", + ), +] + + +def available_providers() -> List[Provider]: + """Registry entries that actually have an API key set in the environment.""" + return [p for p in REGISTRY if p.api_key] diff --git a/free_llm_router/ratelimit.py b/free_llm_router/ratelimit.py new file mode 100644 index 0000000..23b3248 --- /dev/null +++ b/free_llm_router/ratelimit.py @@ -0,0 +1,64 @@ +""" +Per-provider rate limiting. + +Free tiers police two axes simultaneously: requests-per-minute (burst) and +requests-per-day (quota). We enforce both: + + * RPM via a classic token bucket (smooth refill, allows short bursts). + * RPD via a simple daily counter the caller resets out-of-band. + +Async-safe. A subtle TOCTOU bug bit an earlier project: refilling tokens only +in ``acquire`` let two coroutines both see "1 token left" before either consumed. +Here refill happens under the same lock that does the consume, so check-and-take +is atomic. +""" + +from __future__ import annotations + +import asyncio + + +class TokenBucket: + """Async token bucket: ``rpm`` tokens, refilled continuously.""" + + def __init__(self, rpm: int, *, monotonic) -> None: + # `monotonic` is injected (time.monotonic) so tests can supply a fake clock. + self._capacity = float(max(rpm, 1)) + self._tokens = float(max(rpm, 1)) + self._refill_per_sec = max(rpm, 1) / 60.0 + self._monotonic = monotonic + self._last = monotonic() + self._lock = asyncio.Lock() + self._day_count = 0 + + def _refill(self) -> None: + now = self._monotonic() + elapsed = now - self._last + if elapsed > 0: + self._tokens = min(self._capacity, self._tokens + elapsed * self._refill_per_sec) + self._last = now + + async def try_acquire(self) -> bool: + """Take one token if available. Returns False instead of blocking.""" + async with self._lock: + self._refill() # refill INSIDE the lock — atomic with the consume below + if self._tokens >= 1.0: + self._tokens -= 1.0 + self._day_count += 1 + return True + return False + + async def seconds_until_token(self) -> float: + """How long until at least one token is available (for backoff hints).""" + async with self._lock: + self._refill() + if self._tokens >= 1.0: + return 0.0 + return (1.0 - self._tokens) / self._refill_per_sec + + @property + def day_count(self) -> int: + return self._day_count + + def reset_day(self) -> None: + self._day_count = 0 diff --git a/free_llm_router/router.py b/free_llm_router/router.py new file mode 100644 index 0000000..5cdd780 --- /dev/null +++ b/free_llm_router/router.py @@ -0,0 +1,247 @@ +""" +FreeLLMRouter — failover across free, OpenAI-compatible providers. + +Usage mirrors a normal chat-completions client, but a single call may try several +providers in turn until one succeeds: + + router = FreeLLMRouter() + result = await router.chat_completion( + messages=[{"role": "user", "content": "Summarize: ..."}], + tier="smart", + ) + print(result["text"], "via", result["provider"]) + +The return shape matches OperatorOS's existing OpenRouter client, so it can be +dropped in as a replacement: + {text, model, provider, tokens:{prompt,completion,total}, latency_ms, cost_usd} +""" + +from __future__ import annotations + +import asyncio +import logging +import time +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional + +import httpx + +from .health import CircuitBreaker +from .providers import Provider, Tier, available_providers +from .ratelimit import TokenBucket + +logger = logging.getLogger("free_llm_router") + +# Map domain task types (OperatorOS / DragonScope vocabulary) onto the two tiers. +TASK_TIER: Dict[str, Tier] = { + "factual": "fast", + "classification": "fast", + "bulk": "fast", + "sentiment": "fast", + "advisory": "smart", + "computation": "smart", + "drafting": "smart", + "summarization": "smart", + "briefing": "smart", +} + + +class AllProvidersFailed(RuntimeError): + """Raised when every eligible provider was skipped or errored.""" + + +@dataclass +class ProviderStats: + """Live signals the ordering policy can use to rank a provider.""" + + provider: Provider + circuit_state: str # "closed" | "open" | "half_open" + tokens_available: bool # has an RPM token right now + day_count: int # requests already spent today + day_limit: Optional[int] # documented RPD, or None + last_latency_ms: float # most recent successful round-trip (0 if none yet) + + +# ── Default ordering policy ───────────────────────────────────────────────────── +# Static priority only. This is the seam where smarter, health-aware ranking lives +# — see OrderFn and the note in chat_completion(). +def default_order(stats: List[ProviderStats]) -> List[Provider]: + return [s.provider for s in sorted(stats, key=lambda s: s.provider.priority)] + + +OrderFn = Callable[[List[ProviderStats]], List[Provider]] + + +class FreeLLMRouter: + def __init__( + self, + providers: Optional[List[Provider]] = None, + *, + order_fn: OrderFn = default_order, + monotonic: Callable[[], float] = time.monotonic, + request_timeout: float = 45.0, + ) -> None: + self._providers = providers if providers is not None else available_providers() + if not self._providers: + logger.warning( + "FreeLLMRouter has no providers — set at least one of " + "GROQ_API_KEY / CEREBRAS_API_KEY / GOOGLE_AI_STUDIO_API_KEY / " + "MISTRAL_API_KEY / OPENROUTER_API_KEY" + ) + self._order_fn = order_fn + self._timeout = request_timeout + self._buckets = {p.name: TokenBucket(p.rpm, monotonic=monotonic) for p in self._providers} + self._breakers = {p.name: CircuitBreaker(monotonic=monotonic) for p in self._providers} + self._last_latency: Dict[str, float] = {p.name: 0.0 for p in self._providers} + self._client: Optional[httpx.AsyncClient] = None + + async def _http(self) -> httpx.AsyncClient: + if self._client is None or self._client.is_closed: + self._client = httpx.AsyncClient(timeout=httpx.Timeout(self._timeout, connect=10.0)) + return self._client + + async def close(self) -> None: + if self._client and not self._client.is_closed: + await self._client.aclose() + self._client = None + + # ── snapshot for the ordering policy ──────────────────────────────────── + async def _snapshot(self) -> List[ProviderStats]: + stats: List[ProviderStats] = [] + for p in self._providers: + bucket = self._buckets[p.name] + stats.append( + ProviderStats( + provider=p, + circuit_state=self._breakers[p.name].state.value, + tokens_available=(await bucket.seconds_until_token()) == 0.0, + day_count=bucket.day_count, + day_limit=p.rpd, + last_latency_ms=self._last_latency[p.name], + ) + ) + return stats + + # ── main entry point ───────────────────────────────────────────────────── + async def chat_completion( + self, + messages: List[Dict[str, str]], + *, + tier: Optional[Tier] = None, + task_type: Optional[str] = None, + temperature: float = 0.3, + max_tokens: int = 2048, + ) -> Dict[str, Any]: + resolved_tier: Tier = tier or TASK_TIER.get(task_type or "", "smart") + + ordered = self._order_fn(await self._snapshot()) + attempted: List[str] = [] + last_error: Optional[Exception] = None + + for provider in ordered: + model = provider.model_for(resolved_tier) + if model is None: + continue + + breaker = self._breakers[provider.name] + bucket = self._buckets[provider.name] + + if not await breaker.allow(): + continue + if provider.rpd is not None and bucket.day_count >= provider.rpd: + continue + if not await bucket.try_acquire(): + continue # rate-limited this minute; let the next provider take it + + attempted.append(provider.name) + try: + result = await self._call(provider, model, messages, temperature, max_tokens) + await breaker.record_success() + self._last_latency[provider.name] = result["latency_ms"] + logger.info( + "free-llm: %s/%s ok tokens=%d latency=%.0fms", + provider.name, model, result["tokens"]["total"], result["latency_ms"], + ) + return result + except Exception as exc: # noqa: BLE001 — any failure => try next provider + last_error = exc + await breaker.record_failure() + logger.warning("free-llm: %s failed (%s) — failing over", provider.name, exc) + continue + + raise AllProvidersFailed( + f"No free provider served the request (tried: {attempted or 'none eligible'}). " + f"Last error: {last_error}" + ) + + async def _call( + self, + provider: Provider, + model: str, + messages: List[Dict[str, str]], + temperature: float, + max_tokens: int, + ) -> Dict[str, Any]: + headers = { + "Authorization": f"Bearer {provider.api_key}", + "Content-Type": "application/json", + } + if provider.referer: + headers["HTTP-Referer"] = provider.referer + headers["X-Title"] = "free-llm-router" + headers.update(provider.extra_headers) + + payload = { + "model": model, + "messages": messages, + "temperature": temperature, + "max_tokens": max_tokens, + } + + client = await self._http() + start = time.monotonic() + resp = await client.post( + f"{provider.base_url}/chat/completions", json=payload, headers=headers + ) + latency_ms = (time.monotonic() - start) * 1000 + resp.raise_for_status() + data = resp.json() + + choice = data["choices"][0] + usage = data.get("usage", {}) or {} + prompt_t = usage.get("prompt_tokens", 0) + completion_t = usage.get("completion_tokens", 0) + total_t = usage.get("total_tokens", prompt_t + completion_t) + + return { + "text": choice["message"]["content"], + "model": data.get("model", model), + "provider": provider.name, + "tokens": {"prompt": prompt_t, "completion": completion_t, "total": total_t}, + "latency_ms": round(latency_ms, 2), + "cost_usd": 0.0, # free tier — kept for drop-in contract compatibility + } + + # ── convenience helper used by classification-style callers ────────────── + async def quick_classify(self, text: str, categories: List[str]) -> str: + cats = ", ".join(categories) + result = await self.chat_completion( + messages=[ + { + "role": "system", + "content": ( + f"Classifier. Reply with EXACTLY one of: {cats}. " + "No explanation, no punctuation." + ), + }, + {"role": "user", "content": text}, + ], + tier="fast", + temperature=0.0, + max_tokens=16, + ) + raw = result["text"].strip().lower() + for cat in categories: + if cat.lower() in raw: + return cat + return categories[0] diff --git a/processors/sentiment.py b/processors/sentiment.py index 334de6e..036fa57 100644 --- a/processors/sentiment.py +++ b/processors/sentiment.py @@ -5,12 +5,22 @@ - Sector-level sentiment for banking, markets, real estate, etc. """ +import json import logging import math +import os import re from core.base_processor import BaseProcessor +try: + from free_llm_router import AllProvidersFailed, FreeLLMRouter + from free_llm_router.policy import smart_order + + _FREE_AVAILABLE = True +except Exception: # pragma: no cover - import guard + _FREE_AVAILABLE = False + logger = logging.getLogger(__name__) # Keyword-based policy direction detection (pre-compiled with word boundaries) @@ -58,9 +68,16 @@ def __init__(self, config: dict = None): "multilingual_model", "cardiffnlp/twitter-xlm-roberta-base-sentiment" ) self.fallback = self.config.get("fallback", "vader") + # LLM tier (free providers) sits ABOVE FinBERT/VADER. Enabled by default + # when the package is present; degrades to the existing chain on failure. + self.use_llm = self.config.get( + "use_llm", + _FREE_AVAILABLE and os.environ.get("FREE_LLM_ENABLED", "true").lower() == "true", + ) self._pipeline = None self._multilingual_pipeline = None self._vader = None + self._free_router = None def _get_pipeline(self): if self._pipeline is None: @@ -147,6 +164,12 @@ def _analyze(self, text: str, language: str = "en") -> tuple[float, str]: """ from processors.language_detector import get_sentiment_model_for_language + # Top tier: free LLM with financial context. Falls through on any failure. + if self.use_llm: + llm = self._llm_score(text) + if llm is not None: + return llm + model_type = get_sentiment_model_for_language(language) if model_type == "vader": @@ -198,6 +221,71 @@ def _analyze_multilingual(self, text: str) -> tuple[float, str]: logger.debug(f"[Sentiment] Multilingual model failed: {e}") return self._sanitize_score(self._vader_score(text)), "vader" + def _get_free_router(self): + if not (_FREE_AVAILABLE and self.use_llm): + return None + if self._free_router is None: + self._free_router = FreeLLMRouter(order_fn=smart_order) + return self._free_router + + @staticmethod + def _run_async(coro): + """Run an async coroutine from this sync processor. + + Celery sync workers have no running loop, so asyncio.run is safe. If a loop + is somehow already running, bail (caller falls back to FinBERT/VADER). + """ + import asyncio + + try: + asyncio.get_running_loop() + return None # already in a loop — don't risk a nested-run crash + except RuntimeError: + pass + return asyncio.run(coro) + + def _llm_score(self, text: str): + """Financial sentiment via free LLM. Returns (score, model) or None. + + Asks for a compact JSON object so we get both an intensity score AND the + financial direction (which keyword/FinBERT scoring conflates with tone). + """ + router = self._get_free_router() + if router is None: + return None + + messages = [ + { + "role": "system", + "content": ( + "You are a financial sentiment analyst. Read the text and reply " + "with ONLY a JSON object: " + '{"score": , "direction": "bullish"|"bearish"|"neutral"}. ' + "score reflects market sentiment intensity (negative=bearish). " + "No prose, no code fences." + ), + }, + {"role": "user", "content": text[:2000]}, + ] + try: + result = self._run_async( + router.chat_completion( + messages, task_type="sentiment", temperature=0.0, max_tokens=48 + ) + ) + if result is None: + return None + raw = result["text"].strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip() + parsed = json.loads(raw) + score = self._sanitize_score(float(parsed["score"])) + return score, f"llm:{result.get('provider', 'free')}" + except AllProvidersFailed as exc: + logger.debug("[Sentiment] All free providers failed: %s", exc) + return None + except (json.JSONDecodeError, KeyError, ValueError, TypeError) as exc: + logger.debug("[Sentiment] LLM returned unparseable output: %s", exc) + return None + def _vader_score(self, text: str) -> float: if self._vader is None: try: From 25e95cc1e8dcf95e231081a76b08b628114a5ed3 Mon Sep 17 00:00:00 2001 From: mrinal Date: Thu, 18 Jun 2026 19:43:22 +0530 Subject: [PATCH 2/9] =?UTF-8?q?feat:=20PALIMPSEST=20China-intel=20?= =?UTF-8?q?=E2=80=94=20DDTI=20censorship=20index=20+=20CBB=20economic=20co?= =?UTF-8?q?nditions=20engine?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DDTI (political/censorship layer): - Weibo hot-search collector + Chinese finance/sentiment (zh_finance, negation-aware) - Censorship deletion probe (defusedxml-hardened) + selectivity/novelty threat index - Live CDT pull with disk time-series; XSS-hardened "Redacted Terminal" dashboard - Chinese lexicons: finance, censorship gazetteer, market modifiers, threat categories CBB (economic conditions layer, built via Kimi 44-agent swarm): - Comtrade mirror-trade + CN high-frequency indicator collectors - Sector diffusion-index engine (D=0.4*SD+0.6*AS, momentum, mirror-gap), offline self-test passes - Conditions report generator + sector heatmap dashboard - 9-sector taxonomy + 25-source China data catalog; ConditionsIndexSnapshot time-series All modules compile; self-tests pass; pipelines run end-to-end. Data layer partly stubbed (22/25 CN sources todo, Comtrade needs key) — engine verified, live data is next. Co-Authored-By: Claude Opus 4.8 (1M context) Co-Authored-By: Kimi Code (Moonshot) --- .gitignore | 1 + KIMI_CBB_PROMPT.md | 128 +++ PALIMPSEST_BRIEF.md | 43 + PALIMPSEST_CBB_PLAN.md | 122 +++ api/main.py | 4 + api/routes/conditions.py | 88 +++ api/routes/ddti.py | 67 ++ collectors/cn_indicators.py | 449 +++++++++++ collectors/comtrade_mirror.py | 397 ++++++++++ collectors/ddti_probe.py | 220 ++++++ collectors/weibo_hotsearch.py | 136 ++++ config/cbb_taxonomy.json | 1082 ++++++++++++++++++++++++++ config/cn_hf_sources.json | 279 +++++++ config/ddti_threat_categories.json | 49 ++ config/sources.yaml | 226 ++++++ config/zh_censorship_gazetteer.json | 85 ++ config/zh_finance_lexicon.json | 23 + config/zh_market_modifiers.json | 12 + core/tasks.py | 15 + dashboards/conditions_dashboard.html | 318 ++++++++ dashboards/ddti_dashboard.html | 386 +++++++++ processors/conditions_index.py | 531 +++++++++++++ processors/conditions_report.py | 433 +++++++++++ processors/ddti_index.py | 355 +++++++++ processors/sentiment.py | 30 +- processors/zh_finance.py | 128 +++ scripts/conditions_pull.py | 358 +++++++++ scripts/ddti_feasibility.py | 147 ++++ scripts/ddti_live_pull.py | 204 +++++ storage/models.py | 56 ++ 30 files changed, 6369 insertions(+), 3 deletions(-) create mode 100644 KIMI_CBB_PROMPT.md create mode 100644 PALIMPSEST_BRIEF.md create mode 100644 PALIMPSEST_CBB_PLAN.md create mode 100644 api/routes/conditions.py create mode 100644 api/routes/ddti.py create mode 100644 collectors/cn_indicators.py create mode 100644 collectors/comtrade_mirror.py create mode 100644 collectors/ddti_probe.py create mode 100644 collectors/weibo_hotsearch.py create mode 100644 config/cbb_taxonomy.json create mode 100644 config/cn_hf_sources.json create mode 100644 config/ddti_threat_categories.json create mode 100644 config/zh_censorship_gazetteer.json create mode 100644 config/zh_finance_lexicon.json create mode 100644 config/zh_market_modifiers.json create mode 100644 dashboards/conditions_dashboard.html create mode 100644 dashboards/ddti_dashboard.html create mode 100644 processors/conditions_index.py create mode 100644 processors/conditions_report.py create mode 100644 processors/ddti_index.py create mode 100644 processors/zh_finance.py create mode 100644 scripts/conditions_pull.py create mode 100644 scripts/ddti_feasibility.py create mode 100644 scripts/ddti_live_pull.py diff --git a/.gitignore b/.gitignore index 67f7865..e583ea5 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ .env cookies.json data/ +.agents/ *.egg-info/ .venv/ venv/ diff --git a/KIMI_CBB_PROMPT.md b/KIMI_CBB_PROMPT.md new file mode 100644 index 0000000..d7d0edd --- /dev/null +++ b/KIMI_CBB_PROMPT.md @@ -0,0 +1,128 @@ +# Kimi Prompt — China Economic Conditions Engine (full 5-step build) + +Paste everything in the fenced block below into the Kimi Code window (interactive, where Kimi +has file read/write tools). It is self-contained and neutrally framed (open macro-nowcasting), +and deliberately leverages Kimi's China-data knowledge. Do NOT point Kimi at PALIMPSEST_BRIEF.md +or PALIMPSEST_CBB_PLAN.md — those contain political framing that can trigger a refusal; this +prompt inlines everything needed. + +After Kimi finishes: send the output back to Claude to review/integrate, wire the dashboard route +into api/main.py, and add the cross-domain (political↔economic) layer on Claude's side. + +--- + +```text +ROLE & MISSION +You are a senior Python data engineer AND a Chinese-macroeconomics specialist. Build a +comprehensive, independent macroeconomic "conditions engine" for the Chinese economy: a +high-frequency nowcasting system that estimates real economic activity by sector and region +from PUBLIC data — international trade statistics, Chinese-domestic high-frequency indicators, +and Chinese-language financial-media sentiment. This is standard open macro-research practice +(in the spirit of private nowcasting products like China Beige Book, Rhodium, and central-bank +activity trackers). All inputs are public; the goal is timely, granular, triangulated economic +signal. Implement it end to end, in an existing FastAPI + SQLAlchemy + Celery codebase. + +LEVERAGE YOUR CHINA EXPERTISE — THIS IS THE WHOLE POINT +A generic model misses China's best high-frequency data. You don't. Use your knowledge: +1. Chinese-domestic high-frequency indicators that lead/triangulate official monthly stats. + Build a catalog (and wire the accessible free ones). Consider (you know more than this list): + - Autos: 乘联会 (CPCA) weekly/monthly passenger-vehicle retail & wholesale + - Steel/iron ore: Mysteel 钢材库存/价格, rebar utilization; 百年建筑网 construction-site activity + - Cement/construction: 数字水泥网, cement shipment/utilization + - Coal/energy: CCTD 秦皇岛动力煤, daily coal consumption of major power groups + - Freight/logistics: CCFI & SCFI 出口集装箱运价指数, 中国公路物流运价指数, BDI + - Trade hubs: 义乌中国小商品指数 (Yiwu commodity index) + - Property: 克而瑞 / 中指院 weekly new-home sales of 30 major cities, land transactions + - Mobility: 百度迁徙指数 / 高德 migration & congestion, 12306 rail & 民航 air travel, 电影票房 box office + - Macro: 国家统计局 (NBS) monthly, 海关总署 (Customs) imports/exports, 中物联/财新 PMI, PBOC社融/M2 + For EACH source produce: name (zh+en), URL/API, access method (open JSON/CSV vs scrape), + update frequency, which sector it proxies, and access difficulty (easy/medium/hard). +2. Build a rich Chinese-language sentiment keyword lexicon PER SECTOR (zh terms + en gloss) for + tagging Chinese financial-media text. Also list good Chinese financial-media feeds for sentiment + (e.g., 东方财富, 新浪财经, 财新, 第一财经, 华尔街见闻) with RSS/API access notes. +3. Map sectors to Chinese industrial & regional reality: which provinces/clusters dominate each + sector (e.g., 广东/江苏/浙江 export manufacturing, 山西/内蒙 coal, 河北 steel), and give a few + representative listed firms per sector with SOE-vs-private tag (for a future firm-level layer). + +FIRST read these files to match conventions EXACTLY (do not invent new patterns): +- core/base_collector.py (collect()->parse()->validate(); source_type="api" routes to EconomicData) +- collectors/rss_feeds.py (clean concrete collector example) +- core/base_processor.py + processors/daily_digest.py (aggregate-processor pattern: override run()) +- storage/models.py + api/database.py (model style; init_db = create_all) +- config/sources.yaml + core/registry.py + core/tasks.py (registration, scheduling, tasks) + +DELIVERABLES — implement ALL of the following: + +STEP 1 — Independent data collectors (BaseCollector subclasses, source_type="api") +1a) collectors/comtrade_mirror.py — `ComtradeMirrorCollector` (name="comtrade_mirror"). + Pull MONTHLY China trade (reporterCode 156) by HS chapter in BOTH directions, plus the + "mirror" view (partnerCode=156 reported by partners) so a mirror gap can be computed. + Base "https://comtradeapi.un.org/data/v1/get"; optional env COMTRADE_API_KEY sent as header + "Ocp-Apim-Subscription-Key"; make base URL + key config-driven; log reachability/status; degrade + gracefully. parse() -> EconomicData rows: indicator=f"trade_{flow}_{hs}", value=primaryValue (USD), + unit="USD", metadata={hs,flow,reporter,partner,period,netWeight}, date=period. Add a sources.yaml entry. +1b) config/cn_hf_sources.json — the documented catalog from your China expertise (item 1 above). + collectors/cn_indicators.py — `CNIndicatorsCollector`: implement collectors for the EASILY + accessible free sources (open JSON/CSV), and clearly-marked TODO stubs (with access notes) for + the scrape-required ones. Same EconomicData shape. Config-driven list of enabled sources. + +STEP 2 — config/cbb_taxonomy.json + 9 sectors: manufacturing, property_construction, retail_consumer, services, agriculture, + mining_commodities, transport_logistics, finance_banking, technology. + For each sector: HS chapters (refine to useful 6-digit codes), zh+en sentiment keywords, + the cn_hf_sources that proxy it, dominant provinces, and 2-3 representative listed firms (SOE/private). + Regions: coastal_export, inland, northeast, national. + +STEP 3 — processors/conditions_index.py (PURE function + aggregate processor + __main__ self-test) + Pure compute_conditions(trade_series, cn_indicators, sentiment_mentions, taxonomy, now). + Per sector s, month t: + # sentiment diffusion: classify each mention pos if score>+0.15, neg if <-0.15 else neutral + SD = 100 * (n_pos - n_neg) / max(1, n_pos + n_neg + n_neutral) # -100..+100 + # anchor signal from period-over-period growth g of the sector's anchor (trade and/or cn_hf) + AS = 100 * tanh(g / 0.10) # bounded -100..+100 + # blended conditions index (no anchor -> w_anchor=0, w_sent=1, confidence="low") + D = 0.4*SD + 0.6*AS + momentum = D_t - D_{t-1} # flow, not level + # cross-source triangulation (standard trade-statistics reconciliation): + mirror_gap = 100 * (mirror_value - reported_value) / max(1, reported_value) + confidence = f(n_mentions, anchor_available) -> "low"/"med"/"high" + Return per-sector dict {sector, region, D, SD, AS, momentum, mirror_gap, confidence, + n_mentions, inputs}. Keep PURE & offline-testable. __main__ feeds 2-3 synthetic months and + prints a sector table. + Aggregate `ConditionsIndexProcessor(BaseProcessor)`: override run(), read recent EconomicData + (comtrade_mirror + cn_indicators) + sentiment, call compute_conditions, publish to Redis + "cbb:latest" (best-effort) and persist a snapshot row. + +STEP 4 — processors/conditions_report.py (pattern = daily_digest.py; use the project's free_llm_router if present, else Ollama/Anthropic fallback like daily_digest) + Generate a neutral "China Economic Conditions Report": sector-by-sector conditions & momentum, + biggest movers, and a "cross-source triangulation" section noting where independent/commercial + indicators and mirror statistics differ from headline official series (framed as data-quality / + nowcasting commentary, NOT accusation). Store the report; optional Telegram/email like daily_digest. + +STEP 5 — dashboard + api/routes/conditions.py — `/api/v4/conditions/{index,report,dashboard}` reading Redis "cbb:latest" + (register in api/main.py with prefix /api/v4). + dashboards/conditions_dashboard.html — a sector x region CONDITIONS HEATMAP (diffusion color + scale red->amber->green), momentum arrows, confidence dots, and a mirror-gap column; clean, modern, + responsive. IMPORTANT: build all HTML from data via a strict escaper (escape & < > " ' on every + interpolated string; coerce numbers) — never inject raw values; map any color from a fixed + whitelist, never from raw data. Try fetching the live API, fall back to an embedded sample, and + clearly badge LIVE vs SAMPLE. + +STORAGE — storage/models.py: add `ConditionsIndexSnapshot` (generated_at, period, sector, region, + diffusion, sentiment, anchor, momentum, mirror_gap, confidence, n_mentions, inputs JSONB; indexes + on generated_at and sector). Must be created by create_all. + +RUNNABLE — scripts/conditions_pull.py: end-to-end WITHOUT Celery/DB — pull data -> compute -> write a + timestamped JSON to data/cbb/ + append data/cbb/history.jsonl; also write Postgres + Redis if + available (each wrapped in try/except; print which tiers succeeded); print the sector table. + +CONSTRAINTS +- Match existing code style; reuse httpx, pandas, SQLAlchemy as the project does. No new heavy deps. +- Handle all network/parse errors gracefully (log + continue). UTF-8; ensure_ascii=False on JSON. +- Everything compiles (python -m py_compile) and the pure compute_conditions + its __main__ self-test + run OFFLINE with synthetic inputs. Public data only. + +DELIVER: implement every file above, run py_compile on each + the self-test, and report results, +plus a short summary of which Chinese data sources you wired vs stubbed and why. +``` diff --git a/PALIMPSEST_BRIEF.md b/PALIMPSEST_BRIEF.md new file mode 100644 index 0000000..60aed1f --- /dev/null +++ b/PALIMPSEST_BRIEF.md @@ -0,0 +1,43 @@ +# PALIMPSEST-II / DDTI — Project Brief (for Kimi) + +You are collaborating with Claude Code on this app. Here's what it is and where you can help. + +## What the app is +A **latent-state China-intelligence engine** built as an extension of `social_scraper` +(a Python collector → processor → API platform). It treats "what's really happening in +China" as a hidden state estimated from many biased sensors. The flagship module is the +**DDTI (Deletion-Differential Threat Index)**: *treat the censor as a sensor* — what the +regime deletes, how fast, and how selectively reveals what it fears. + +## Architecture (existing patterns — match them) +- **Collectors** (`collectors/*.py`, subclass `core.base_collector.BaseCollector`): + `collect() → parse() → validate()`. Registered in `config/sources.yaml`, scheduled by Celery Beat. +- **Processors** (`processors/*.py`, subclass `core.base_processor.BaseProcessor`): NLP/aggregation over collected Articles. +- **Storage**: Postgres (`storage/models.py`), Redis (live cache), disk (`data/`). +- **API**: FastAPI (`api/main.py`, routes in `api/routes/`). +- **Free LLM**: `free_llm_router` is wired for translation/synthesis. + +## DDTI pieces already built +- `collectors/ddti_probe.py` — pulls China Digital Times deletion feeds. +- `processors/ddti_index.py` — `compute_selectivity_novelty()` ranks censored terms by + **threat = attention (time-decayed frequency) × novelty (burst / first-appearance)**. + `extract_terms()` pulls quoted spans + an English gazetteer + the Chinese finance lexicon + tags. +- `processors/zh_finance.py` + `config/zh_finance_lexicon.json` — Chinese finance/policy lexicon (you built this earlier) + negation-aware hawkish/dovish/sector detection. +- `dashboards/ddti_dashboard.html` — "Redacted Intelligence Terminal" UI. +- `scripts/ddti_live_pull.py` — pulls real CDT data, stores a disk time-series. + +## Current constraint (why your help matters) +From normal egress only CDT's English root feed is reachable; the richer **Chinese** +deletion feeds (CDT Chinese, Weibo, FreeWeibo) are Cloudflare-blocked and await a proxy. +When that proxy lands, the pipeline will ingest **Chinese** censored text — and our term +extraction has no Chinese *censorship* vocabulary yet (only finance terms + English entities). + +## Where you (Kimi) can help — Chinese-language layers +1. **Chinese censorship / sensitive-terms gazetteer** (most valuable now) — euphemisms and + deletion-trigger phrases that evade filters: June-4 euphemisms (八平方, 占中…), leadership + euphemisms, protest/dissent slang, economic-distress slang, 润学/emigration, censorship-meta. +2. Mapping CDT English topic tags ↔ canonical Chinese threat categories. +3. Reviewing Chinese sentiment edge cases (negation, sarcasm) in `processors/zh_finance.py`. + +Output structured data (JSON) Claude will review and integrate. Claude owns the code/architecture; +you own the Chinese-language knowledge. Do not endanger sources — public/aggregated terms only. diff --git a/PALIMPSEST_CBB_PLAN.md b/PALIMPSEST_CBB_PLAN.md new file mode 100644 index 0000000..2e5d193 --- /dev/null +++ b/PALIMPSEST_CBB_PLAN.md @@ -0,0 +1,122 @@ +# PALIMPSEST → China-Beige-Book-style Conditions Engine — Build Plan + +## Goal +Reproduce China Beige Book's *philosophy and output*, not its survey moat: +1. **Independent ground-truth** that ignores official NBS stats (we use physical anchors instead of firm surveys). +2. **Sector × region** disaggregation as **diffusion indices** (% improving − % deteriorating). +3. **Flow over level** — emphasize change (momentum), not absolute snapshots. +4. **Divergence flag** — surface where independent data contradicts official claims (CBB's signature). +5. A periodic **"China Conditions Report"**. + +What we explicitly DON'T copy: CBB's primary firm-survey network (unreplicable solo). Our +substitute is OSINT + un-fakeable physical anchors, with confidence scoring that is honest +about being noisier than surveys. + +--- + +## Layer 1 — Independent data (the survey substitute) +All as `BaseCollector` subclasses → `EconomicData` table (`source_type="api"`). + +| Collector | Source | Cost/reach | Feeds which sectors | Status | +|---|---|---|---|---| +| `comtrade_mirror.py` | UN Comtrade (partner-reported trade) | free API (key for higher limits); globally reachable | manufacturing, property (ore/cement/copper), tech (semiconductors HS85), agri (soybeans), commodities | **Phase 1** | +| `viirs_nightlights.py` | NASA Black Marble VNP46 | free | regional activity, property (new districts) | Phase 3 | +| `ais_ports.py` | aisstream.io websocket | free | transport/logistics, export tempo | Phase 3 | +| (existing) sentiment/DDTI | scraped articles | built | every sector (sentiment diffusion) | ✅ | + +**Why Comtrade first:** partner customs data is collected *outside* China → un-massageable; +free; reachable from normal egress (unlike the Cloudflare-blocked censorship feeds). It is the +single most CBB-spirited dataset and the cleanest first build. + +Comtrade specifics: pull monthly China (reporter=156) trade AND mirror (partners report China as +partner), by HS chapter. Key chapters → sectors: HS72/73 steel, HS25 cement, HS26 ores, HS74 copper +→ **property/construction**; HS84/85 machinery/electronics → **manufacturing/tech**; HS85 (8541/8542 +semiconductors) → **tech**; HS12 oilseeds, HS10 cereals → **agriculture**; HS27 energy → **energy**. +Mirror gap (partner-reported China imports − China-reported exports) = a distortion signal. + +--- + +## Layer 2 — Sector × region taxonomy (`config/cbb_taxonomy.json`) +**Sectors (9):** manufacturing, property_construction, retail_consumer, services, agriculture, +mining_commodities, transport_logistics, finance_banking, technology. +**Regions (initial):** coastal_export (Guangdong, Jiangsu, Zhejiang, Shanghai, Fujian), +inland, northeast_rustbelt, national. (Region split comes mainly from nightlights + provincial +trade; mark coverage="partial" until then.) +**Firm-type (stretch axis):** SOE vs private — inferable later from filings/announcements; defer. + +Each sector entry maps to: sentiment keywords (zh+en), the anchor series that proxy it, and an +optional official series for the divergence check. + +--- + +## Layer 3 — Diffusion-index formula (`processors/conditions_index.py`, pure core) +Per sector *s*, period *t* (monthly): + +**Sentiment diffusion** (from sector-tagged articles with sentiment score ∈ [−1,1]; classify each +mention positive if score>+θ, negative if <−θ, else neutral; θ=0.15): +``` +SD = 100 · (n_pos − n_neg) / max(1, n_pos + n_neg + n_neutral) # range −100..+100 +``` + +**Anchor signal** (for sectors with a physical proxy; g = period-over-period growth of the anchor +series, e.g. mirror-trade volume; k = reference scale ≈ 0.10): +``` +AS = 100 · tanh(g / k) # bounded −100..+100 +``` + +**Blended conditions index** (weights sum to 1; if no anchor, w_anchor=0 and confidence drops): +``` +D = w_sent · SD + w_anchor · AS (default w_sent=0.4, w_anchor=0.6 when anchor exists) +``` + +**Momentum (flow, the CBB emphasis):** `ΔD = D_t − D_{t−1}` + +**Confidence:** `C = f(n_mentions, anchor_available)` → low/med/high; reported with every cell. + +**Divergence (CBB signature):** when an official series O exists, +``` +Div = AS_independent − normalize(O_official) +``` +Persistent Div<0 (official > independent) = suspected overstatement → flagged in the report. + +All outputs are **−100..+100** so red↔green heatmaps and QoQ arrows render directly. + +--- + +## Layer 4 — Storage (time-series) (`storage/models.py`) +New `ConditionsIndexSnapshot`: (generated_at, period, sector, region, diffusion D, sentiment SD, +anchor AS, momentum ΔD, divergence, confidence, n_mentions, inputs JSONB). One row per +sector×region×period → native time-series (mirrors `ddti_index_snapshots`). Redis `cbb:latest` +for the live dashboard; disk JSON fallback like the DDTI pull. + +--- + +## Layer 5 — Report generator (`processors/conditions_report.py`, pattern = daily_digest.py) +Assemble the sector grid + anchors + divergence flags → `free_llm_router` → a CBB-style brief: +per-sector conditions, biggest movers (momentum), and an explicit "where official data looks +overstated" section. Periodic (monthly/quarterly). Stored + optionally emailed/Telegram'd. + +--- + +## Layer 6 — Dashboard (extend or sibling of ddti_dashboard.html) +Sector × region **conditions heatmap** (diffusion color scale, momentum arrows, confidence dots, +divergence ⚠ flags), plus a conditions time-series sparkline per sector from the snapshot history. +Same "Redacted Terminal" aesthetic; XSS-safe rendering (esc()/num(), whitelisted color scales). + +--- + +## Build sequence & gates +- **Phase 1 — Comtrade mirror-trade collector** + taxonomy config. GATE: collector returns real + partner-trade rows for ≥5 HS chapters from this egress. +- **Phase 2 — Conditions index** (pure formula + processor over sentiment, anchor optional) + + storage model + a cold-start run. GATE: a sector grid renders with non-flat diffusion. +- **Phase 3 — More anchors** (nightlights, AIS) for regional + non-trade sectors. +- **Phase 4 — Conditions report** generator. +- **Phase 5 — Dashboard heatmap** + history sparklines. + +## Honest limits +- No firm-level survey data — OSINT proxies are noisier; every cell ships a confidence score. +- Region/firm-type axes start thin (sentiment is rarely geotagged); grow with nightlights/filings. +- Divergence needs an official series to compare against; where absent, report independent-only. +- Comtrade lags weeks–months and is vintage-revised → carry vintage-aware timestamps, no look-ahead. +- HK re-exports / CIF-FOB asymmetries distort mirror-trade → use as a consensus signal, not gospel. diff --git a/api/main.py b/api/main.py index 09bcb7f..fd22e9d 100644 --- a/api/main.py +++ b/api/main.py @@ -19,6 +19,8 @@ from api.routes.data import router as data_router from api.routes.digest import router as digest_router from api.routes.health_v4 import router as monitoring_router +from api.routes.ddti import router as ddti_router +from api.routes.conditions import router as conditions_router # API Key Manager from apikeys.routes import router as keys_router @@ -76,6 +78,8 @@ async def lifespan(app: FastAPI): app.include_router(data_router, prefix="/api/v4") app.include_router(digest_router, prefix="/api/v4") app.include_router(monitoring_router, prefix="/api/v4") +app.include_router(ddti_router, prefix="/api/v4") +app.include_router(conditions_router, prefix="/api/v4") # API Key Manager app.include_router(keys_router, prefix="/api/v4") diff --git a/api/routes/conditions.py b/api/routes/conditions.py new file mode 100644 index 0000000..cdc0fbc --- /dev/null +++ b/api/routes/conditions.py @@ -0,0 +1,88 @@ +"""China Economic Conditions API — serves the CBB conditions index + report + dashboard. + +Reads what processors.conditions_index.ConditionsIndexProcessor publishes to Redis: + cbb:latest — the latest sector-by-sector conditions snapshot (JSON) +""" + +import json +import logging +import os +from pathlib import Path + +from fastapi import APIRouter +from fastapi.responses import HTMLResponse, JSONResponse + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/conditions", tags=["conditions"]) + +_DASHBOARD = Path(__file__).resolve().parent.parent.parent / "dashboards" / "conditions_dashboard.html" +_REPORT_PATH = Path(os.getenv("DATA_DIR", "./data")) / "cbb" / "reports" / "latest.md" + +_SAMPLE_REPORT = """# China Economic Conditions Report (sample) + +> This is a fallback sample. Run `processors/conditions_report.py` to generate a live report. + +## Sector summary + +| Sector | Region | D | Momentum | Confidence | +|--------|--------|---|----------|------------| +| Electronics & machinery | coastal_export | 12.3 | ▲ | high | +| Textiles & apparel | coastal_export | -8.1 | ▼ | med | +| Steel & metals | northeast | 3.4 | ▬ | low | +| Property & construction | national | -22.7 | ▼ | med | +| Logistics & freight | national | 7.8 | ▲ | high | + +## Cross-source triangulation + +- Mirror-reported trade gaps are widest in property-linked sectors. +- High-frequency freight indicators are turning up while official manufacturing PMI is flat. +""" + + +def _redis(): + import redis + return redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"), decode_responses=True) + + +@router.get("/index") +async def conditions_index(): + """Latest China economic conditions index. status='live' when real data exists.""" + try: + r = _redis() + raw = r.get("cbb:latest") + r.close() + if not raw: + return JSONResponse({ + "status": "empty", + "note": "No conditions index computed yet. Run processors/conditions_index or " + "scripts/conditions_pull.py to populate cbb:latest.", + "sectors": [], + }) + data = json.loads(raw) + data["status"] = "live" + return data + except Exception as e: + logger.warning(f"[Conditions-API] index read failed: {e}") + return JSONResponse({"status": "error", "error": str(e), "sectors": []}) + + +@router.get("/report") +async def conditions_report(): + """Latest China economic conditions markdown report.""" + try: + if _REPORT_PATH.exists(): + report = _REPORT_PATH.read_text(encoding="utf-8") + return {"status": "live", "report": report} + except Exception as e: + logger.warning(f"[Conditions-API] report read failed: {e}") + + return {"status": "sample", "report": _SAMPLE_REPORT} + + +@router.get("/dashboard", response_class=HTMLResponse) +async def conditions_dashboard(): + """The visual conditions dashboard (same-origin, so live fetch works without CORS).""" + try: + return HTMLResponse(_DASHBOARD.read_text(encoding="utf-8")) + except FileNotFoundError: + return HTMLResponse("

conditions_dashboard.html not found

", status_code=404) diff --git a/api/routes/ddti.py b/api/routes/ddti.py new file mode 100644 index 0000000..db6fe75 --- /dev/null +++ b/api/routes/ddti.py @@ -0,0 +1,67 @@ +"""DDTI API — serves the censorship selectivity/novelty index + the dashboard. + +Reads what processors.ddti_index.DDTIIndexProcessor publishes to Redis: + ddti:index:latest — the ranked threat index (JSON) + alerts:ddti — stream of newly-sensitive / high-threat terms +""" + +import json +import logging +import os +from pathlib import Path + +from fastapi import APIRouter, Query +from fastapi.responses import HTMLResponse, JSONResponse + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/ddti", tags=["ddti"]) + +_DASHBOARD = Path(__file__).resolve().parent.parent.parent / "dashboards" / "ddti_dashboard.html" + + +def _redis(): + import redis + return redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"), decode_responses=True) + + +@router.get("/index") +async def ddti_index(): + """Latest DDTI threat index. status='live' when real data exists.""" + try: + r = _redis() + raw = r.get("ddti:index:latest") + r.close() + if not raw: + return JSONResponse({ + "status": "empty", + "note": "No index computed yet. Enable ddti_probe in sources.yaml and run " + "generate_ddti_index once deletions accumulate.", + "ranked": [], + }) + data = json.loads(raw) + data["status"] = "live" + return data + except Exception as e: + logger.warning(f"[DDTI-API] index read failed: {e}") + return JSONResponse({"status": "error", "error": str(e), "ranked": []}) + + +@router.get("/alerts") +async def ddti_alerts(limit: int = Query(50, ge=1, le=200)): + """Recent newly-sensitive / high-threat term alerts (newest first).""" + try: + r = _redis() + items = r.lrange("alerts:ddti", 0, limit - 1) + r.close() + return {"alerts": [json.loads(x) for x in items]} + except Exception as e: + return {"alerts": [], "error": str(e)} + + +@router.get("/dashboard", response_class=HTMLResponse) +async def ddti_dashboard(): + """The visual dashboard (same-origin, so live fetch works without CORS).""" + try: + return HTMLResponse(_DASHBOARD.read_text(encoding="utf-8")) + except FileNotFoundError: + return HTMLResponse("

ddti_dashboard.html not found

", status_code=404) diff --git a/collectors/cn_indicators.py b/collectors/cn_indicators.py new file mode 100644 index 0000000..1ff15b4 --- /dev/null +++ b/collectors/cn_indicators.py @@ -0,0 +1,449 @@ +"""Chinese high-frequency economic indicators collector. + +Fetches public Chinese / China-relevant high-frequency indicators as configured +in ``enabled_sources``. Sources marked ``access="todo"`` are logged and +skipped; open JSON/CSV sources are fetched and parsed. Failures are caught, +logged, and the collector continues. + +The collector can be driven three ways: + +1. ``config.enabled_sources`` as a list of full source dicts. +2. ``config.enabled_sources`` as a list of keys, filtering ``config/cn_hf_sources.json``. +3. No config → the full ``config/cn_hf_sources.json`` catalog is used, falling + back to a small built-in set of open World Bank China proxies. +""" + +import io +import json +import logging +import math +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + +_CATALOG_PATH = Path(__file__).resolve().parent.parent / "config" / "cn_hf_sources.json" + +# Minimal built-in fallback used when the external catalog is absent. +# World Bank endpoints are open, stable, and require no API key. +_DEFAULT_SOURCES: list[dict] = [ + { + "key": "wb_chn_gdp", + "name_zh": "世界银行中国GDP", + "name_en": "World Bank China GDP", + "url": "https://api.worldbank.org/v2/country/CHN/indicator/NY.GDP.MKTP.CD?format=json&per_page=20", + "method": "GET", + "parser": "json", + "json_path": "1", + "date_field": "date", + "value_field": "value", + "unit": "USD", + "sector": "macro", + "access": "open_json", + "frequency": "annual", + "difficulty": "easy", + "note": "Open World Bank API; annual GDP in current USD.", + }, + { + "key": "wb_chn_cpi", + "name_zh": "世界银行中国CPI通胀", + "name_en": "World Bank China CPI Inflation", + "url": "https://api.worldbank.org/v2/country/CHN/indicator/FP.CPI.TOTL.ZG?format=json&per_page=20", + "method": "GET", + "parser": "json", + "json_path": "1", + "date_field": "date", + "value_field": "value", + "unit": "%", + "sector": "macro", + "access": "open_json", + "frequency": "annual", + "difficulty": "easy", + "note": "Open World Bank API; annual CPI inflation.", + }, + { + "key": "wb_chn_exports", + "name_zh": "世界银行中国货物服务出口", + "name_en": "World Bank China Exports of Goods and Services", + "url": "https://api.worldbank.org/v2/country/CHN/indicator/NE.EXP.GNFS.CD?format=json&per_page=20", + "method": "GET", + "parser": "json", + "json_path": "1", + "date_field": "date", + "value_field": "value", + "unit": "USD", + "sector": "manufacturing", + "access": "open_json", + "frequency": "annual", + "difficulty": "easy", + "note": "Open World Bank API; annual exports in current USD.", + }, + # High-frequency Chinese domestic sources — mostly scrape/paid, stubbed as TODO. + { + "key": "bdi", + "name_zh": "波罗的海干散货指数", + "name_en": "Baltic Dry Index", + "url": "https://www.balticexchange.com", + "method": "GET", + "parser": "json", + "unit": "points", + "sector": "transport_logistics", + "access": "todo", + "frequency": "daily", + "difficulty": "medium", + "note": "Daily bulk freight index. Real-time public feed requires subscription or scrape.", + }, + { + "key": "ccfi", + "name_zh": "中国出口集装箱运价指数", + "name_en": "China Containerized Freight Index", + "url": "http://www.sse.net.cn", + "method": "GET", + "parser": "json", + "unit": "points", + "sector": "transport_logistics", + "access": "todo", + "frequency": "weekly", + "difficulty": "medium", + "note": "Published by Shanghai Shipping Exchange; no stable open API — scrape required.", + }, + { + "key": "scfi", + "name_zh": "上海出口集装箱运价指数", + "name_en": "Shanghai Containerized Freight Index", + "url": "http://www.sse.net.cn", + "method": "GET", + "parser": "json", + "unit": "points", + "sector": "transport_logistics", + "access": "todo", + "frequency": "weekly", + "difficulty": "medium", + "note": "Published by Shanghai Shipping Exchange; no stable open API — scrape required.", + }, + { + "key": "yiwu_index", + "name_zh": "义乌中国小商品指数", + "name_en": "Yiwu China Commodity Index", + "url": "http://www.ywindex.com", + "method": "GET", + "parser": "json", + "unit": "points", + "sector": "retail_consumer", + "access": "todo", + "frequency": "weekly", + "difficulty": "medium", + "note": "Yiwu small-commodity price index; no stable open API — scrape required.", + }, + { + "key": "cpca_retail_pv", + "name_zh": "乘联会乘用车零售销量", + "name_en": "CPCA Passenger Vehicle Retail Sales", + "url": "http://www.cpcaauto.com", + "method": "GET", + "parser": "json", + "unit": "units", + "sector": "retail_consumer", + "access": "todo", + "frequency": "weekly", + "difficulty": "hard", + "note": "CPCA weekly/monthly PV retail; published as HTML/Excel — scrape required.", + }, + { + "key": "cpca_wholesale_pv", + "name_zh": "乘联会乘用车批发销量", + "name_en": "CPCA Passenger Vehicle Wholesale Sales", + "url": "http://www.cpcaauto.com", + "method": "GET", + "parser": "json", + "unit": "units", + "sector": "retail_consumer", + "access": "todo", + "frequency": "weekly", + "difficulty": "hard", + "note": "CPCA weekly/monthly PV wholesale; published as HTML/Excel — scrape required.", + }, +] + + +class CNIndicatorsCollector(BaseCollector): + """Collector for Chinese high-frequency economic indicators.""" + + name = "cn_indicators" + source_type = "api" + + def __init__(self, config: dict): + super().__init__(config) + self.enabled_sources = self._load_sources() + + # ── Configuration loading ─────────────────────────────────────── + + def _load_sources(self) -> list[dict]: + """Resolve enabled_sources from config and/or catalog.""" + raw_sources = self.config.get("enabled_sources", ...) + + if raw_sources is not ...: + # Explicit config: list of full dicts, list of keys, or empty list. + if isinstance(raw_sources, list): + if raw_sources and isinstance(raw_sources[0], dict): + return [self._normalize_source(s) for s in raw_sources] + if raw_sources and isinstance(raw_sources[0], str): + catalog = self._load_catalog() + key_set = set(raw_sources) + return [self._normalize_source(s) for s in catalog if s.get("key") in key_set] + return [] + return [] + + # No explicit config: load whole catalog, then built-in defaults. + catalog = self._load_catalog() + if catalog: + return [self._normalize_source(s) for s in catalog] + return [self._normalize_source(s) for s in _DEFAULT_SOURCES] + + @staticmethod + def _load_catalog() -> list[dict]: + try: + if _CATALOG_PATH.exists(): + data = json.loads(_CATALOG_PATH.read_text(encoding="utf-8")) + if isinstance(data, dict): + return data.get("sources", []) or data.get("enabled_sources", []) or [] + if isinstance(data, list): + return data + except Exception as e: + logger.warning(f"[CNIndicators] Failed to load catalog {_CATALOG_PATH}: {e}") + return [] + + @staticmethod + def _normalize_source(src: dict) -> dict: + """Make catalog items compatible with collector config fields.""" + normalized = dict(src) + # cn_hf_sources.json uses access_method; collector internals use access. + if "access" not in normalized and "access_method" in normalized: + normalized["access"] = normalized["access_method"] + # Default parser/method if missing. + normalized.setdefault("method", "GET") + normalized.setdefault("parser", "json") + normalized.setdefault("date_field", "date") + normalized.setdefault("value_field", "value") + normalized.setdefault("unit", "") + normalized.setdefault("sector", "macro") + normalized.setdefault("frequency", "unknown") + normalized.setdefault("name_zh", normalized.get("key", "")) + normalized.setdefault("name_en", normalized.get("key", "")) + return normalized + + # ── Collection ────────────────────────────────────────────────── + + async def collect(self) -> list[dict]: + """Fetch configured sources and return normalized raw records.""" + records: list[dict] = [] + + for src in self.enabled_sources: + key = src["key"] + access = src.get("access", "todo") + + if access == "todo": + logger.info( + f"[CNIndicators] TODO: {key} — " + f"{src.get('note', 'scrape/paid source not yet implemented')}" + ) + continue + + url = src.get("url") + if not url: + logger.warning(f"[CNIndicators] {key}: no URL configured") + continue + + try: + items = await self._fetch_source(src) + except Exception as e: + logger.warning(f"[CNIndicators] {key}: fetch/parse failed: {e}") + continue + + if not isinstance(items, list): + logger.warning( + f"[CNIndicators] {key}: expected list of observations, got {type(items).__name__}" + ) + continue + + count = 0 + date_field = src.get("date_field", "date") + value_field = src.get("value_field", "value") + + for item in items: + if not isinstance(item, dict): + continue + + date = self._normalize_date(item.get(date_field)) + value = self._normalize_value(item.get(value_field)) + if date is None or value is None: + continue + + records.append({ + "key": key, + "date": date, + "value": value, + "unit": src.get("unit", ""), + "sector": src.get("sector", "macro"), + "frequency": src.get("frequency", "unknown"), + "source_name_zh": src.get("name_zh", key), + "source_name_en": src.get("name_en", key), + "url": url, + "access": access, + "metadata_extra": { + k: v for k, v in item.items() + if k not in (date_field, value_field) + }, + }) + count += 1 + + logger.info(f"[CNIndicators] {key}: collected {count} records") + + logger.info(f"[CNIndicators] Total records collected: {len(records)}") + return records + + async def _fetch_source(self, src: dict) -> Any: + """Fetch one source and return the list of observations.""" + url = src["url"] + method = src.get("method", "GET").upper() + + if method == "POST": + resp = await self._http.post(url) + else: + resp = await self._http.get(url) + + logger.info(f"[CNIndicators] {src['key']}: HTTP {resp.status_code} from {url}") + + if resp.status_code != 200: + logger.warning( + f"[CNIndicators] {src['key']}: non-200 status {resp.status_code}" + ) + return [] + + parser = src.get("parser", "json") + if parser == "csv": + df = pd.read_csv(io.StringIO(resp.text)) + return df.to_dict("records") + + data = resp.json() + return self._get_nested(data, src.get("json_path")) + + # ── Parsing helpers ───────────────────────────────────────────── + + @staticmethod + def _get_nested(data: Any, path: Optional[str]) -> Any: + """Navigate a dotted path (supports dict keys and list indices).""" + if not path: + return data + current = data + for part in path.split("."): + if current is None: + return None + if part.isdigit(): + idx = int(part) + if isinstance(current, list) and 0 <= idx < len(current): + current = current[idx] + else: + return None + elif isinstance(current, dict): + current = current.get(part) + else: + return None + return current + + @staticmethod + def _normalize_date(value: Any) -> Optional[datetime]: + """Convert a raw date value to a timezone-aware UTC datetime.""" + if value is None: + return None + + if isinstance(value, datetime): + if value.tzinfo is None: + return value.replace(tzinfo=timezone.utc) + return value + + # Numeric year (e.g. World Bank annual observations). + if isinstance(value, (int, float)) and not isinstance(value, bool): + try: + return datetime(int(value), 1, 1, tzinfo=timezone.utc) + except (ValueError, OverflowError): + return None + + s = str(value).strip() + if not s: + return None + + # ISO / pandas timestamp. + try: + return datetime.fromisoformat(s.replace("Z", "+00:00")) + except ValueError: + pass + + for fmt in ("%Y-%m-%d", "%Y-%m", "%Y", "%Y/%m/%d", "%d-%m-%Y"): + try: + return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc) + except ValueError: + pass + + return None + + @staticmethod + def _normalize_value(value: Any) -> Optional[float]: + """Convert a raw value to float, returning None for missing/invalid.""" + if value is None: + return None + + if isinstance(value, bool): + return None + + if isinstance(value, (int, float)): + v = float(value) + return v if math.isfinite(v) else None + + s = str(value).strip().replace(",", "") + if s.lower() in ("", ".", "-", "nd", "na", "n/a", "null", "none"): + return None + + try: + v = float(s) + return v if math.isfinite(v) else None + except ValueError: + return None + + # ── Parse / Validate ──────────────────────────────────────────── + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform raw records into the EconomicData schema.""" + rows = [] + for r in raw_data: + rows.append({ + "indicator": r["key"], + "date": r["date"], + "value": r["value"], + "unit": r["unit"], + "metadata": { + "category": r["sector"], + "frequency": r["frequency"], + "source_name_zh": r["source_name_zh"], + "source_name_en": r["source_name_en"], + "url": r["url"], + "access": r["access"], + "sector": r["sector"], + "raw": r.get("metadata_extra", {}), + }, + }) + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + """Validate that parsed rows contain the required columns.""" + required = ["indicator", "date", "value"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/comtrade_mirror.py b/collectors/comtrade_mirror.py new file mode 100644 index 0000000..0c8b8b3 --- /dev/null +++ b/collectors/comtrade_mirror.py @@ -0,0 +1,397 @@ +"""UN Comtrade mirror collector for China merchandise trade. + +Fetches two complementary views from the UN Comtrade API: + +1. **Reported by China** — `reporterCode=156`, `partnerCode=0` (world), + flows `M` (imports) and `X` (exports). +2. **Mirror view** — major partner countries reporting trade with China + (`partnerCode=156`). Flow directions are inverted so they are comparable + to China-reported flows: + * partner `M` → stored as `X` (China export mirror) + * partner `X` → stored as `M` (China import mirror) + +Each record is written to the `economic_data` table with: +* `source`: `comtrade_mirror` +* `indicator`: `trade_{flow}_{hs}` for the China-reported view and + `trade_{flow}_{hs}_mirror` for the mirror view. The suffix keeps the two + views distinct under the unique ``(source, indicator, date)`` index. +* `value`: `primaryValue` in USD +* `metadata`: `{"hs", "flow", "reporter", "partner", "period", "netWeight", + "view", "original_flow"?}` + +The collector is config-driven but supplies sensible defaults so it can be +registered from `sources.yaml` with minimal boilerplate. +""" + +import asyncio +import logging +import math +import os +import time +from datetime import datetime, timezone +from typing import Any + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import RateLimitError, SchemaChangedError, SourceDownError + +logger = logging.getLogger(__name__) + +# Two-digit HS chapters most relevant for China cyclical/export sectors. +_DEFAULT_HS_CHAPTERS = [ + "84", # machinery / electrical machinery nuclei + "85", # electrical machinery + "62", # apparel (non-knit) + "61", # apparel (knit) + "73", # articles of iron/steel + "72", # iron/steel + "39", # plastics + "90", # optical / medical instruments + "29", # organic chemicals + "27", # mineral fuels +] + +# Major trading partners used to build the mirror view (UN M49 reporter codes). +_DEFAULT_PARTNER_REPORTERS = [ + 842, # United States + 392, # Japan + 276, # Germany + 410, # Rep. of Korea + 704, # Viet Nam + 528, # Netherlands + 826, # United Kingdom + 356, # India + 36, # Australia + 643, # Russian Federation +] + + +class ComtradeMirrorCollector(BaseCollector): + name = "comtrade_mirror" + source_type = "api" + + def __init__(self, config: dict): + super().__init__(config) + self.base_url = config.get( + "base_url", "https://comtradeapi.un.org/data/v1/get" + ).rstrip("/") + self.api_key = config.get("api_key") or os.getenv("COMTRADE_API_KEY") or None + self.hs_chapters = [ + str(hs).strip()[:2] + for hs in config.get("hs_chapters", _DEFAULT_HS_CHAPTERS) + ] + self.partner_reporters = [ + int(p) for p in config.get("partner_reporters", _DEFAULT_PARTNER_REPORTERS) + ] + self.recent_months = int(config.get("recent_months", 12)) + self.type_code = config.get("type_code", "C") + self.freq_code = config.get("freq_code", "M") + self.classification = config.get("classification", "HS") + self.include_desc = bool(config.get("include_desc", True)) + self.inter_request_delay = float(config.get("inter_request_delay", 1.0)) + self._last_request_at = 0.0 + + # ── Public lifecycle ───────────────────────────────────────────── + + async def collect(self) -> list[dict]: + """Fetch raw Comtrade records for both China-reported and mirror views.""" + headers = {} + if self.api_key: + headers["Ocp-Apim-Subscription-Key"] = self.api_key + + periods = self._periods(self.recent_months) + cmd_code = ",".join(self.hs_chapters) + + logger.info( + f"[{self.name}] Collecting Comtrade data: periods={periods}, " + f"hs_chapters={self.hs_chapters}, partners={len(self.partner_reporters)}" + ) + + records: list[dict] = [] + rate_limited = False + rate_limit_retry = 60 + + # 1) Reported by China + for period in periods: + if rate_limited: + break + for flow in ("M", "X"): + try: + url = self._endpoint(period, reporter=156) + params = self._params(flow=flow, partner=0, cmd_code=cmd_code) + batch = await self._fetch(url, params, headers) + records.extend(batch) + except RateLimitError as e: + rate_limited = True + rate_limit_retry = int(e.retry_after or 60) + logger.warning( + f"[{self.name}] Rate limited on China-reported view; " + f"keeping {len(records)} records collected so far" + ) + break + + # 2) Mirror view (partner reporters) + for partner in self.partner_reporters: + if rate_limited: + break + for period in periods: + if rate_limited: + break + for flow in ("M", "X"): + try: + url = self._endpoint(period, reporter=partner) + params = self._params(flow=flow, partner=156, cmd_code=cmd_code) + batch = await self._fetch(url, params, headers) + # Tag mirror records so parse() can invert the flow. + for rec in batch: + rec["_mirror_reporter"] = partner + rec["_original_flow"] = flow + records.extend(batch) + except RateLimitError as e: + rate_limited = True + rate_limit_retry = int(e.retry_after or 60) + logger.warning( + f"[{self.name}] Rate limited on mirror view; " + f"keeping {len(records)} records collected so far" + ) + break + + logger.info( + f"[{self.name}] Collected {len(records)} raw records " + f"(rate_limited={rate_limited})" + ) + + if not records and rate_limited: + logger.warning( + f"[{self.name}] Rate limited and no records collected; " + f"returning empty to allow graceful degradation" + ) + if not records: + logger.warning( + f"[{self.name}] No records collected from Comtrade; " + f"degrading gracefully" + ) + return records + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform raw Comtrade records into standardized EconomicData rows. + + Sub-commodity rows (e.g. HS6 codes returned for an HS2 chapter query) are + aggregated to the (indicator, date) level so they do not violate the + unique ``(source, indicator, date)`` index on ``economic_data``. Mirror + rows are written under ``trade_{flow}_{hs}_mirror`` so the China-reported + and mirror views can coexist for the same flow and HS chapter. + """ + buckets: dict[tuple[str, datetime], dict] = {} + for rec in raw_data: + try: + parsed = self._parse_record(rec) + if not parsed: + continue + key = (parsed["indicator"], parsed["date"]) + bucket = buckets.get(key) + if bucket is None: + bucket = { + "indicator": parsed["indicator"], + "date": parsed["date"], + "value": 0.0, + "unit": parsed["unit"], + "metadata": dict(parsed["metadata"]), + } + bucket["metadata"]["netWeight"] = 0.0 + buckets[key] = bucket + bucket["value"] += parsed["value"] + new_nw = parsed["metadata"].get("netWeight") or 0.0 + bucket["metadata"]["netWeight"] += new_nw + except Exception as e: + logger.warning(f"[{self.name}] Skipping malformed record: {e}") + + rows = [] + for bucket in buckets.values(): + if not bucket["metadata"].get("netWeight"): + bucket["metadata"]["netWeight"] = None + rows.append(bucket) + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + if df.empty: + return True + required = ["indicator", "date", "value"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True + + # ── Internal helpers ───────────────────────────────────────────── + + def _endpoint(self, period: str, reporter: int) -> str: + return ( + f"{self.base_url}/{self.type_code}/{self.freq_code}/" + f"{self.classification}/{period}/{reporter}" + ) + + def _params(self, *, flow: str, partner: int, cmd_code: str) -> dict[str, Any]: + params = { + "flowCode": flow, + "partnerCode": str(partner), + "cmdCode": cmd_code, + } + if self.include_desc: + params["includeDesc"] = "True" + return params + + async def _fetch( + self, url: str, params: dict, headers: dict + ) -> list[dict]: + """Execute one throttled GET and return the dataset list.""" + await self._throttle() + try: + resp = await self._http.get(url, params=params, headers=headers) + except Exception as e: + logger.warning(f"[{self.name}] Request error for {url}: {e}") + return [] + + if resp.status_code == 429: + retry_after = 60 + try: + retry_after = int(resp.headers.get("Retry-After", 60)) + except (ValueError, TypeError): + pass + raise RateLimitError(self.name, retry_after=retry_after) + + if resp.status_code != 200: + logger.warning( + f"[{self.name}] HTTP {resp.status_code} for {resp.url}" + ) + return [] + + try: + payload = resp.json() + except Exception as e: + logger.warning(f"[{self.name}] Non-JSON response from {resp.url}: {e}") + return [] + + dataset = self._extract_dataset(payload) + if dataset: + logger.debug(f"[{self.name}] {len(dataset)} records from {resp.url}") + return dataset + + async def _throttle(self): + elapsed = time.monotonic() - self._last_request_at + if elapsed < self.inter_request_delay: + await asyncio.sleep(self.inter_request_delay - elapsed) + self._last_request_at = time.monotonic() + + def _extract_dataset(self, payload: Any) -> list[dict]: + if isinstance(payload, list): + return payload + if isinstance(payload, dict): + for key in ("dataset", "data", "items"): + val = payload.get(key) + if isinstance(val, list): + return val + return [] + + def _parse_record(self, rec: dict) -> dict | None: + flow = self._field(rec, "flowCode") or self._field(rec, "flow") + if not flow: + return None + + is_mirror = bool( + rec.get("_mirror_reporter") + or str(self._field(rec, "partnerCode")) == "156" + ) + original_flow = flow + if is_mirror: + # Invert flow so partner imports from China become China exports. + flow = "X" if flow == "M" else "M" + + period = self._field(rec, "period") + if not period or len(str(period)) != 6: + return None + + hs_raw = self._field(rec, "cmdCode") or self._field(rec, "cmd") + if not hs_raw: + return None + hs_chapter = str(hs_raw).strip()[:2] + + value_raw = self._field(rec, "primaryValue") + if value_raw is None: + return None + try: + value = float(value_raw) + except (ValueError, TypeError): + return None + if not math.isfinite(value): + return None + + net_weight_raw = self._field(rec, "netWgt") or self._field(rec, "netWeight") + net_weight = None + if net_weight_raw is not None: + try: + net_weight = float(net_weight_raw) + if not math.isfinite(net_weight): + net_weight = None + except (ValueError, TypeError): + net_weight = None + + reporter = self._field(rec, "reporterCode") + partner = self._field(rec, "partnerCode") + + # Mirror rows are aggregated across partner reporters, so reporter is + # normalised to 0 (world aggregate) while partner=156 keeps the mirror + # identity needed by the conditions processor. + metadata = { + "hs": hs_chapter, + "flow": flow, + "reporter": 0 if is_mirror else reporter, + "partner": partner, + "period": str(period), + "netWeight": net_weight, + "view": "mirror" if is_mirror else "reported", + } + if is_mirror: + metadata["original_flow"] = original_flow + + base_indicator = f"trade_{flow}_{hs_chapter}" + indicator = f"{base_indicator}_mirror" if is_mirror else base_indicator + + return { + "indicator": indicator, + "date": self._period_to_date(period), + "value": value, + "unit": "USD", + "metadata": metadata, + } + + @staticmethod + def _field(rec: dict, key: str) -> Any: + if key in rec: + return rec[key] + # Some APIs return PascalCase or lower-cased aliases. + alt = key[0].lower() + key[1:] if key else key + if alt != key and alt in rec: + return rec[alt] + return None + + @staticmethod + def _period_to_date(period) -> datetime: + s = str(period) + year = int(s[:4]) + month = int(s[4:6]) + return datetime(year, month, 1, tzinfo=timezone.utc) + + @staticmethod + def _periods(n: int) -> list[str]: + """Return the last `n` months as YYYYMM strings (most recent first).""" + now = datetime.now(timezone.utc) + year, month = now.year, now.month + periods = [] + for _ in range(n): + periods.append(f"{year}{month:02d}") + month -= 1 + if month == 0: + month = 12 + year -= 1 + return periods diff --git a/collectors/ddti_probe.py b/collectors/ddti_probe.py new file mode 100644 index 0000000..675ecd7 --- /dev/null +++ b/collectors/ddti_probe.py @@ -0,0 +1,220 @@ +"""DDTI feasibility probe — can we observe Weibo deletions in 2026? + +The Deletion-Differential Threat Index (DDTI) treats the censor as a sensor: +deletion velocity + selectivity = the regime's revealed threat-perception. +Its empirical foundation (Zhu et al. 2013; Bamman et al. 2012) is a decade old +and predates Weibo's API lockdown and the shift to silent, server-side +censorship. So before building the index, we must answer one question: + + Is a usable deletion signal still reconstructable today, from here? + +This module provides: + * a scheduled BaseCollector that ingests deletion observations from passive + anti-censorship feeds (CDT / FreeWeibo / GreatFire) into the Article table; + * the analytical core (survival-curve buckets, post-status classification, + active-liveness checking) used by scripts/ddti_feasibility.py to emit the + GO / NO-GO verdict. + +NOTE ON ENDPOINTS: the feed URLs are CANDIDATES, listed in sources.yaml. Their +availability in 2026 is exactly what the probe measures — do not assume any of +them work; let the reachability matrix report the truth. +""" + +import hashlib +import logging +import math +from datetime import datetime, timezone + +import pandas as pd + +from core.base_collector import BaseCollector + +logger = logging.getLogger(__name__) + +# Deletion feeds are untrusted external XML — use defusedxml to block XXE and +# billion-laughs attacks. Fall back to stdlib only if defusedxml is absent, and +# say so loudly so the gap is visible rather than silent. +try: + from defusedxml import ElementTree as ET + _XML_HARDENED = True +except ImportError: # pragma: no cover + from xml.etree import ElementTree as ET + _XML_HARDENED = False + logger.warning( + "[DDTI] defusedxml not installed — parsing untrusted XML with stdlib " + "(vulnerable to XXE/billion-laughs). Run: pip install defusedxml" + ) + +# Cumulative survival buckets (seconds). Zhu et al. (2013) reference values, to +# be RE-MEASURED not assumed: ~5% @ 8min, ~30% @ 30min, ~90% @ 24h. +SURVIVAL_BUCKETS = [ + ("8m", 8 * 60), + ("30m", 30 * 60), + ("1h", 60 * 60), + ("6h", 6 * 3600), + ("24h", 24 * 3600), + ("3d", 3 * 86400), +] +HISTORICAL_REFERENCE = {"30m": 0.30, "24h": 0.90} # Zhu et al. 2013, for sanity-check only + +# ── Post-status classification ──────────────────────────────────── +# Weibo does NOT label *who* deleted a post. These Chinese markers (substring, +# never \b — that doesn't anchor on CJK) map a fetched page to a status plus a +# censorship-likelihood in [0,1]. User-deletions are noise; the law/regulation +# language and fast silent removal of high-reach posts are the censorship signal. +_STATUS_MARKERS = [ + # (substring, status, censorship_likelihood) + ("根据相关法律法规和政策", "censored_explicit", 0.97), + ("相关法律法规", "censored_explicit", 0.95), + ("此微博已被作者删除", "user_deleted", 0.10), + ("由于作者隐私设置", "privacy_restricted", 0.15), + ("你没有权限查看", "privacy_restricted", 0.15), + ("抱歉,此微博已被删除", "deleted_ambiguous", 0.55), + ("已被删除", "deleted_ambiguous", 0.55), + ("微博不存在", "gone", 0.45), + ("页面不存在", "gone", 0.45), + ("该内容暂时无法显示", "censored_explicit", 0.90), +] + + +def classify_post_status(http_status: int, body: str) -> dict: + """Map an HTTP response for a single post to a status + censorship likelihood. + + Returns {"status": str, "censorship_likelihood": float|None}. A likelihood of + None means "uninformative" (network/geo block) and must be EXCLUDED from the + survival curve, not treated as alive. + """ + body = body or "" + + # Hard network/geo signals first — these tell us nothing about censorship. + if http_status in (403, 451): + return {"status": "blocked", "censorship_likelihood": None} + if http_status >= 500 or http_status == 0: + return {"status": "unreachable", "censorship_likelihood": None} + + for marker, status, likelihood in _STATUS_MARKERS: + if marker in body: + return {"status": status, "censorship_likelihood": likelihood} + + if http_status == 404: + # Bare 404 with no marker: ambiguous removal. + return {"status": "gone", "censorship_likelihood": 0.45} + + # 200 with no deletion marker → assume the post is still alive. + return {"status": "alive", "censorship_likelihood": 0.0} + + +def survival_curve(latencies_seconds: list[float]) -> dict: + """Cumulative deletion-survival curve from observed deletion latencies. + + Zhu et al. warn the distribution is long-tailed, so we report cumulative + PERCENTILES (fraction deleted within each bucket), never mean/median. + """ + clean = [x for x in latencies_seconds if x is not None and not math.isnan(x) and x >= 0] + n = len(clean) + curve = {} + for label, secs in SURVIVAL_BUCKETS: + curve[label] = (sum(1 for x in clean if x <= secs) / n) if n else None + return {"n": n, "cumulative_deleted_within": curve} + + +async def check_liveness(client, url: str) -> dict: + """Active liveness check for one post URL (the controllable-resolution path). + + Returns a classification dict; on transport failure returns status + 'unreachable' so the caller can measure reachability rather than crash. + """ + try: + resp = await client.get(url, headers={ + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0 Safari/537.36" + ), + }) + return classify_post_status(resp.status_code, resp.text) + except Exception as e: + logger.debug(f"[DDTI] liveness check failed for {url}: {e}") + return {"status": "unreachable", "censorship_likelihood": None, "error": str(e)} + + +class DDTIProbeCollector(BaseCollector): + """Scheduled ingestion of deletion observations from passive feeds. + + source_type='social_media' routes rows to the Article table (and onward to + the multilingual sentiment processor), not the numeric EconomicData table. + """ + + name = "ddti_probe" + source_type = "social_media" + + def __init__(self, config: dict): + super().__init__(config) + # [{name, url}] candidate deletion feeds, from sources.yaml. + self.feeds = config.get("deletion_feeds", []) + + async def collect(self) -> list[dict]: + records = [] + reachability = {} + for feed in self.feeds: + name, url = feed.get("name", feed["url"]), feed["url"] + try: + resp = await self._http.get(url, headers={"User-Agent": "Mozilla/5.0"}) + reachability[name] = resp.status_code + if resp.status_code != 200: + logger.warning(f"[DDTI] {name} → HTTP {resp.status_code}") + continue + records.extend(self._parse_feed_items(name, resp.text)) + except Exception as e: + reachability[name] = f"error:{type(e).__name__}" + logger.warning(f"[DDTI] {name} unreachable: {e}") + + logger.info(f"[DDTI] reachability={reachability} | observations={len(records)}") + return records + + def _parse_feed_items(self, source: str, text: str) -> list[dict]: + """Best-effort RSS/Atom parse of a deletion feed (CDT etc. are WordPress).""" + out = [] + try: + root = ET.fromstring(text) + except Exception as e: + # Not XML/RSS, or defusedxml rejected a malicious entity. Reachability + # is still recorded by the caller; we just yield no items. + logger.debug(f"[DDTI] {source} XML parse skipped: {type(e).__name__}") + return out + for item in root.iter("item"): + # WordPress RSS (CDT) emits one per tag — a free, curated + # topic signal that feeds the selectivity/novelty index directly. + tags = [c.text.strip() for c in item.findall("category") if (c.text or "").strip()] + out.append({ + "source": source, + "title": (item.findtext("title") or "").strip(), + "text": (item.findtext("description") or "").strip(), + "url": (item.findtext("link") or "").strip(), + "published_at": (item.findtext("pubDate") or "").strip(), + "tags": tags, + }) + return out + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + rows = [] + for r in raw_data: + url = r.get("url", "") + rows.append({ + "title": r.get("title", "")[:280], + "full_text": r.get("text", ""), + "url": url, + "url_hash": hashlib.sha256(url.encode()).hexdigest()[:32] if url else None, + "author": r.get("source", "ddti"), + "published_at": datetime.now(timezone.utc), + "category": "ddti_deletion", + "metadata": { + "feed": r.get("source"), + "raw_published": r.get("published_at"), + "tags": r.get("tags", []), + }, + }) + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + # Empty is valid: a quiet window or unreachable feeds is itself a finding. + return df.empty or ("url" in df.columns and "title" in df.columns) diff --git a/collectors/weibo_hotsearch.py b/collectors/weibo_hotsearch.py new file mode 100644 index 0000000..86d080d --- /dev/null +++ b/collectors/weibo_hotsearch.py @@ -0,0 +1,136 @@ +"""Weibo (微博) hot-search collector. + +Pulls the real-time hot-search ranking (热搜榜) — the top trending topics on +Chinese social media — and keeps only the financially-relevant entries so they +flow into the sentiment pipeline (XLM-RoBERTa handles the Chinese text). + +The public endpoint returns JSON without login: + https://weibo.com/ajax/side/hotSearch + +Design notes: +- source_type = "social_media" so _upsert routes rows to the Article table + (NOT EconomicData), which is what the sentiment processor reads. +- The FULL hot-search list is fetched into immutable raw storage; the financial + filter is applied in parse() so raw data stays complete for audit/backfill. +- Add/adjust nothing in code to tune cadence — edit sources.yaml. +""" + +import hashlib +import logging +from datetime import datetime, timezone +from urllib.parse import quote + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError, SourceDownError + +logger = logging.getLogger(__name__) + +HOT_SEARCH_URL = "https://weibo.com/ajax/side/hotSearch" +SEARCH_URL_TMPL = "https://s.weibo.com/weibo?q=%23{q}%23" # #topic# search page + + +class WeiboHotSearchCollector(BaseCollector): + name = "weibo_hotsearch" + source_type = "social_media" # → Article table → sentiment pipeline + + def __init__(self, config: dict): + super().__init__(config) + # Finance keywords + denylist drive _is_financially_relevant. We prefer the + # shared lexicon (config/zh_finance_lexicon.json) so the collector and the + # sentiment processor stay in sync; sources.yaml can override/extend. + from processors.zh_finance import load_lexicon + lexicon = load_lexicon() + self.finance_keywords = config.get( + "finance_keywords", lexicon.get("finance_keywords", []) + ) + self.denylist = config.get("denylist", lexicon.get("denylist", [])) + + async def collect(self) -> list[dict]: + """Fetch the full real-time hot-search list.""" + # Weibo's ajax endpoint rejects the default scraper UA; send a browser one. + resp = await self._http.get( + HOT_SEARCH_URL, + headers={ + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36" + ), + "Referer": "https://weibo.com/", + }, + ) + if resp.status_code != 200: + raise SourceDownError(self.name, f"HTTP {resp.status_code}") + + payload = resp.json() + if payload.get("ok") != 1: + raise SourceDownError(self.name, f"API ok={payload.get('ok')}") + + realtime = payload.get("data", {}).get("realtime", []) + logger.info(f"[Weibo] Fetched {len(realtime)} hot-search entries") + return realtime + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Keep only finance-relevant topics; shape them into Article rows.""" + rows = [] + for item in raw_data: + word = (item.get("word") or "").strip() + if not word: + continue + + # The "note" is a short blurb; fall back to the topic word itself. + note = (item.get("note") or word).strip() + + if not self._is_financially_relevant(word, note): + continue + + url = SEARCH_URL_TMPL.format(q=quote(word)) + rows.append({ + "title": word, + "full_text": note, + "url": url, + "url_hash": hashlib.sha256(url.encode()).hexdigest()[:32], + "author": "weibo_hotsearch", + "published_at": self._onboard_time(item), + "category": "china_social", + # heat metric kept for ranking/velocity downstream + "metadata": { + "raw_hot": item.get("raw_hot"), + "rank": item.get("rank"), + "label": item.get("label_name"), + "weibo_category": item.get("category"), + }, + }) + + logger.info(f"[Weibo] {len(rows)}/{len(raw_data)} entries kept as finance-relevant") + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + # Empty is valid: a window with no finance topics is normal, not a failure. + if df.empty: + return True + if "title" not in df.columns or "url" not in df.columns: + raise SchemaChangedError(self.name, "missing title/url columns") + return True + + @staticmethod + def _onboard_time(item: dict) -> datetime: + """Weibo gives onboard_time as a unix epoch (seconds); default to now.""" + ts = item.get("onboard_time") + if isinstance(ts, (int, float)) and ts > 0: + return datetime.fromtimestamp(ts, tz=timezone.utc) + return datetime.now(timezone.utc) + + # ── Domain logic ────────────────────────────────────────────── + def _is_financially_relevant(self, word: str, note: str) -> bool: + """Keep a hot-search topic only if it's about markets/economy/finance. + + Substring matching (NOT \\b regex — that doesn't anchor on Chinese). + Denylist wins: it excludes false positives that embed a finance word, + e.g. 经济适用男 ("budget boyfriend" slang) contains 经济 ("economy"). + """ + haystack = f"{word} {note}" + if any(bad in haystack for bad in self.denylist): + return False + return any(kw in haystack for kw in self.finance_keywords) diff --git a/config/cbb_taxonomy.json b/config/cbb_taxonomy.json new file mode 100644 index 0000000..0020842 --- /dev/null +++ b/config/cbb_taxonomy.json @@ -0,0 +1,1082 @@ +{ + "sectors": { + "agriculture": { + "hs_codes": [ + "01", + "02", + "03", + "04", + "05", + "06", + "07", + "08", + "09", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24" + ], + "keywords": { + "zh": [ + "农业", + "农产品", + "粮食", + "谷物", + "大豆", + "猪肉", + "蔬菜", + "水果", + "饲料", + "化肥", + "天气", + "丰收", + "减产", + "禽流感", + "进口", + "出口" + ], + "en": [ + "agriculture", + "agricultural products", + "grain", + "cereals", + "soybeans", + "pork", + "vegetables", + "fruits", + "feed", + "fertilizer", + "weather", + "harvest", + "crop reduction", + "avian influenza", + "imports", + "exports" + ] + }, + "cn_hf_sources": [ + "macro_customs" + ], + "provinces": [ + { + "name_zh": "黑龙江", + "name_en": "Heilongjiang", + "role": "grain production hub" + }, + { + "name_zh": "河南", + "name_en": "Henan", + "role": "grain and livestock hub" + }, + { + "name_zh": "山东", + "name_en": "Shandong", + "role": "vegetable and fruit hub" + }, + { + "name_zh": "四川", + "name_en": "Sichuan", + "role": "livestock and grain hub" + }, + { + "name_zh": "新疆", + "name_en": "Xinjiang", + "role": "cotton and fruit hub" + } + ], + "firms": [ + { + "name_zh": "牧原食品股份有限公司", + "name_en": "Muyuan Foods", + "ticker": "002714.SZ", + "ownership": "private" + }, + { + "name_zh": "新希望六和股份有限公司", + "name_en": "New Hope Liuhe", + "ticker": "000876.SZ", + "ownership": "private" + }, + { + "name_zh": "温氏食品集团股份有限公司", + "name_en": "Wens Foodstuff Group", + "ticker": "300498.SZ", + "ownership": "private" + } + ], + "region": "national" + }, + "finance_banking": { + "hs_codes": [], + "keywords": { + "zh": [ + "银行", + "金融", + "信贷", + "贷款", + "存款", + "利率", + "降准", + "降息", + "加息", + "MLF", + "LPR", + "央行", + "货币政策", + "流动性", + "债券", + "股市", + "A股", + "信贷脉冲", + "社融", + "M2", + "不良资产", + "不良贷款", + "净息差", + "影子银行", + "理财", + "信托", + "金融科技", + "数字人民币", + "商业银行", + "国有银行", + "股份制银行", + "城商行", + "农商行", + "信用卡", + "房贷", + "地方债", + "城投债" + ], + "en": [ + "bank", + "banking", + "finance", + "financial", + "credit", + "loan", + "deposit", + "interest rate", + "RRR cut", + "reserve requirement ratio", + "rate cut", + "PBOC", + "central bank", + "monetary policy", + "liquidity", + "bonds", + "stock market", + "A-shares", + "credit impulse", + "aggregate financing", + "M2", + "NPL", + "non-performing loan", + "NIM", + "net interest margin", + "shadow banking", + "wealth management", + "trust", + "fintech", + "digital yuan", + "e-CNY", + "commercial bank", + "state-owned bank", + "joint-stock bank", + "city commercial bank", + "credit card", + "mortgage", + "local government debt", + "LGFV debt" + ] + }, + "cn_hf_sources": [ + "macro_pboc_credit", + "macro_nbs", + "macro_caixin_pmi", + "macro_cfl_pmi", + "macro_customs" + ], + "provinces": [ + { + "name_zh": "北京", + "name_en": "Beijing", + "role": "national regulatory and policy center" + }, + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "interbank market and stock exchange hub" + }, + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "Shenzhen exchange and fintech center" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "digital payments and wealth management hub" + } + ], + "firms": [ + { + "name_zh": "中国工商银行", + "name_en": "Industrial and Commercial Bank of China", + "ticker": "601398.SH", + "ownership": "SOE" + }, + { + "name_zh": "中国建设银行", + "name_en": "China Construction Bank", + "ticker": "601939.SH", + "ownership": "SOE" + }, + { + "name_zh": "中国农业银行", + "name_en": "Agricultural Bank of China", + "ticker": "601288.SH", + "ownership": "SOE" + } + ], + "region": "national" + }, + "manufacturing": { + "hs_codes": [ + "84", + "85", + "62", + "61", + "72", + "73", + "76", + "39", + "90", + "29", + "94", + "95" + ], + "keywords": { + "zh": [ + "制造业", + "出口", + "订单", + "工厂", + "生产", + "PMI", + "工业", + "机械", + "电子", + "纺织", + "钢铁", + "汽车", + "家电", + "供应链", + "外贸" + ], + "en": [ + "manufacturing", + "exports", + "orders", + "factory", + "production", + "PMI", + "industrial", + "machinery", + "electronics", + "textiles", + "steel", + "automotive", + "home appliances", + "supply chain", + "foreign trade" + ] + }, + "cn_hf_sources": [ + "steel_mysteel", + "bdi", + "ccfi", + "scfi", + "yiwu_index", + "cement_digital", + "coal_cctd", + "autos_cpca_retail" + ], + "provinces": [ + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "export manufacturing hub" + }, + { + "name_zh": "江苏", + "name_en": "Jiangsu", + "role": "electronics and machinery hub" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "private manufacturing and textiles hub" + }, + { + "name_zh": "山东", + "name_en": "Shandong", + "role": "heavy industry and equipment hub" + }, + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "advanced manufacturing and trade gateway" + } + ], + "firms": [ + { + "name_zh": "美的集团", + "name_en": "Midea Group", + "ticker": "000333.SZ", + "ownership": "private" + }, + { + "name_zh": "格力电器", + "name_en": "Gree Electric", + "ticker": "000651.SZ", + "ownership": "private" + }, + { + "name_zh": "比亚迪", + "name_en": "BYD", + "ticker": "1211.HK", + "ownership": "private" + } + ], + "region": "coastal_export" + }, + "mining_commodities": { + "hs_codes": [ + "25", + "26", + "27", + "71", + "72", + "74", + "75", + "76", + "78", + "79", + "80", + "81" + ], + "keywords": { + "zh": [ + "矿业", + "矿产", + "铁矿石", + "铜", + "铝", + "镍", + "锌", + "锡", + "铅", + "煤炭", + "原油", + "大宗商品", + "金属", + "贵金属", + "能源", + "矿产进口", + "矿产出口", + "资源", + "稀土" + ], + "en": [ + "mining", + "minerals", + "iron ore", + "copper", + "aluminum", + "nickel", + "zinc", + "tin", + "lead", + "coal", + "crude oil", + "commodities", + "metals", + "precious metals", + "energy", + "mineral imports", + "mineral exports", + "resources", + "rare earth" + ] + }, + "cn_hf_sources": [ + "coal_cctd", + "coal_power_consumption", + "steel_mysteel", + "bdi" + ], + "provinces": [ + { + "name_zh": "山西", + "name_en": "Shanxi", + "role": "coal mining hub" + }, + { + "name_zh": "内蒙古", + "name_en": "Inner Mongolia", + "role": "coal and rare earth mining hub" + }, + { + "name_zh": "河北", + "name_en": "Hebei", + "role": "steel production base" + }, + { + "name_zh": "江西", + "name_en": "Jiangxi", + "role": "copper and rare earth processing hub" + }, + { + "name_zh": "新疆", + "name_en": "Xinjiang", + "role": "oil and coal frontier region" + } + ], + "firms": [ + { + "name_zh": "中国神华能源", + "name_en": "China Shenhua Energy", + "ticker": "601088.SS", + "ownership": "SOE" + }, + { + "name_zh": "宝钢股份", + "name_en": "Baoshan Iron & Steel", + "ticker": "600019.SS", + "ownership": "SOE" + }, + { + "name_zh": "中国铝业", + "name_en": "Aluminum Corporation of China", + "ticker": "601600.SS", + "ownership": "SOE" + } + ], + "region": "national" + }, + "property_construction": { + "hs_codes": [ + "25", + "26", + "68", + "69", + "70", + "72", + "73", + "74", + "76" + ], + "keywords": { + "zh": [ + "房地产", + "楼市", + "房价", + "商品房", + "住宅", + "新房", + "二手房", + "土地出让", + "土地成交", + "房地产投资", + "房地产开发", + "新开工", + "施工面积", + "竣工面积", + "水泥", + "螺纹钢", + "钢材", + "钢铁", + "建材", + "建筑", + "基建", + "保障房", + "烂尾楼", + "恒大", + "万科", + "碧桂园", + "保利" + ], + "en": [ + "real estate", + "property market", + "housing market", + "home prices", + "residential property", + "commercial property", + "construction", + "cement", + "steel rebar", + "iron ore", + "copper", + "building materials", + "infrastructure", + "land sales", + "property investment", + "housing starts", + "floor space", + "Evergrande", + "Vanke", + "Country Garden" + ] + }, + "cn_hf_sources": [ + "property_cric", + "property_zhongzhi_land", + "cement_digital", + "steel_mysteel", + "steel_100njz_construction" + ], + "provinces": [ + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "major property market and construction demand hub" + }, + { + "name_zh": "江苏", + "name_en": "Jiangsu", + "role": "residential and infrastructure demand hub" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "coastal property market and private developer hub" + }, + { + "name_zh": "北京", + "name_en": "Beijing", + "role": "policy and regulatory centre, tier-1 benchmark market" + }, + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "tier-1 commercial and residential benchmark" + } + ], + "firms": [ + { + "name_zh": "万科企业股份有限公司", + "name_en": "China Vanke Co., Ltd.", + "ticker": "000002.SZ", + "ownership": "private" + }, + { + "name_zh": "保利发展控股集团股份有限公司", + "name_en": "Poly Developments and Holdings Group Co., Ltd.", + "ticker": "600048.SH", + "ownership": "private" + }, + { + "name_zh": "中国建筑股份有限公司", + "name_en": "China State Construction Engineering Corporation Ltd.", + "ticker": "601668.SH", + "ownership": "private" + } + ], + "region": "national" + }, + "retail_consumer": { + "hs_codes": [ + "30", + "33", + "34", + "42", + "43", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "63", + "64", + "65", + "66", + "67", + "87", + "91", + "92", + "96" + ], + "keywords": { + "zh": [ + "零售", + "消费", + "社会消费品零售总额", + "消费升级", + "消费降级", + "消费品", + "耐用消费品", + "快速消费品", + "食品饮料", + "化妆品", + "个人护理", + "服装", + "鞋帽", + "箱包", + "家具", + "家电", + "汽车", + "电商", + "线上零售", + "双十一", + "618", + "CPI", + "消费者信心", + "可支配收入", + "客流量", + "商场" + ], + "en": [ + "retail", + "consumer", + "consumption", + "retail sales", + "consumer spending", + "consumer goods", + "durables", + "FMCG", + "food and beverage", + "cosmetics", + "personal care", + "apparel", + "footwear", + "headgear", + "leather goods", + "furniture", + "home appliances", + "automobiles", + "e-commerce", + "online retail", + "Singles' Day", + "618 shopping festival", + "CPI", + "consumer confidence", + "disposable income", + "foot traffic", + "shopping mall" + ] + }, + "cn_hf_sources": [ + "yiwu_index", + "autos_cpca_retail", + "mobility_box_office" + ], + "provinces": [ + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "largest consumer market and e-commerce hub" + }, + { + "name_zh": "江苏", + "name_en": "Jiangsu", + "role": "high-income consumption and retail hub" + }, + { + "name_zh": "山东", + "name_en": "Shandong", + "role": "large population consumer market" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "private consumption and live-commerce hub" + }, + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "tier-1 retail and luxury consumption benchmark" + }, + { + "name_zh": "四川", + "name_en": "Sichuan", + "role": "inland consumption hub" + } + ], + "firms": [ + { + "name_zh": "阿里巴巴集团", + "name_en": "Alibaba Group", + "ticker": "9988.HK", + "ownership": "private" + }, + { + "name_zh": "京东集团股份有限公司", + "name_en": "JD.com", + "ticker": "9618.HK", + "ownership": "private" + }, + { + "name_zh": "农夫山泉股份有限公司", + "name_en": "Nongfu Spring", + "ticker": "9633.HK", + "ownership": "private" + } + ], + "region": "national" + }, + "services": { + "hs_codes": [], + "keywords": { + "zh": [ + "服务业", + "第三产业", + "消费服务", + "餐饮", + "旅游", + "酒店", + "电影票房", + "航空客运", + "铁路客运", + "出行", + "生活服务", + "休闲娱乐", + "客流量", + "服务消费" + ], + "en": [ + "services", + "tertiary sector", + "consumer services", + "catering", + "tourism", + "hotel", + "box office", + "aviation passenger", + "railway passenger", + "mobility", + "local services", + "leisure", + "foot traffic", + "service consumption" + ] + }, + "cn_hf_sources": [ + "mobility_box_office", + "mobility_civil_aviation", + "mobility_12306", + "mobility_baidu_migration", + "mobility_baidu_congestion", + "mobility_gaode" + ], + "provinces": [ + { + "name_zh": "北京", + "name_en": "Beijing", + "role": "financial, tech and business-services hub" + }, + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "finance, trade and hospitality hub" + }, + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "population, tourism and transport services hub" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "e-commerce and digital services hub" + }, + { + "name_zh": "四川", + "name_en": "Sichuan", + "role": "inland consumption and tourism hub" + } + ], + "firms": [ + { + "name_zh": "美团", + "name_en": "Meituan", + "ticker": "3690.HK", + "ownership": "private" + }, + { + "name_zh": "携程集团", + "name_en": "Trip.com Group", + "ticker": "9961.HK", + "ownership": "private" + }, + { + "name_zh": "中国中免", + "name_en": "China Tourism Group Duty Free", + "ticker": "601888.SS", + "ownership": "private" + } + ], + "region": "national" + }, + "technology": { + "hs_codes": [ + "84", + "85", + "90", + "91", + "92", + "37", + "70" + ], + "keywords": { + "zh": [ + "科技", + "技术", + "电子", + "半导体", + "芯片", + "集成电路", + "人工智能", + "AI", + "5G", + "通信", + "互联网", + "软件", + "硬件", + "智能手机", + "手机", + "电脑", + "笔记本", + "数据中心", + "云计算", + "物联网", + "智能制造", + "高端制造", + "出口", + "订单", + "生产" + ], + "en": [ + "technology", + "tech", + "electronics", + "semiconductors", + "chips", + "integrated circuits", + "artificial intelligence", + "AI", + "5G", + "telecommunications", + "telecom", + "internet", + "software", + "hardware", + "smartphone", + "mobile phone", + "computer", + "laptop", + "data center", + "cloud computing", + "IoT", + "smart manufacturing", + "advanced manufacturing", + "exports", + "orders", + "production" + ] + }, + "cn_hf_sources": [ + "ccfi", + "scfi", + "yiwu_index", + "autos_cpca_retail" + ], + "provinces": [ + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "electronics manufacturing and export hub" + }, + { + "name_zh": "江苏", + "name_en": "Jiangsu", + "role": "semiconductor and ICT manufacturing hub" + }, + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "semiconductor R&D and fintech hub" + }, + { + "name_zh": "北京", + "name_en": "Beijing", + "role": "internet, AI and software hub" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "e-commerce and digital economy hub" + } + ], + "firms": [ + { + "name_zh": "腾讯控股", + "name_en": "Tencent Holdings", + "ticker": "0700.HK", + "ownership": "private" + }, + { + "name_zh": "阿里巴巴集团", + "name_en": "Alibaba Group", + "ticker": "9988.HK", + "ownership": "private" + }, + { + "name_zh": "中芯国际集成电路制造", + "name_en": "SMIC", + "ticker": "688981.SS", + "ownership": "SOE" + } + ], + "region": "coastal_export" + }, + "transport_logistics": { + "hs_codes": [ + "86", + "87", + "88", + "89" + ], + "keywords": { + "zh": [ + "物流", + "运输", + "货运", + "港口", + "集装箱", + "航运", + "海运", + "公路", + "铁路", + "航空", + "快递", + "运价", + "运价指数", + "BDI", + "CCFI", + "SCFI", + "供应链", + "吞吐量", + "外贸", + "出口", + "多式联运", + "物流园", + "中欧班列", + "快递业务量" + ], + "en": [ + "logistics", + "transport", + "freight", + "shipping", + "ports", + "containers", + "maritime", + "railway", + "highway", + "aviation", + "express delivery", + "cargo", + "freight rates", + "BDI", + "CCFI", + "SCFI", + "supply chain", + "throughput", + "exports", + "trade corridor", + "multimodal transport", + "logistics park", + "China-Europe Railway Express", + "parcel volume" + ] + }, + "cn_hf_sources": [ + "bdi", + "ccfi", + "scfi", + "freight_road_logistics", + "mobility_12306", + "mobility_civil_aviation" + ], + "provinces": [ + { + "name_zh": "上海", + "name_en": "Shanghai", + "role": "world’s largest container port and international shipping hub" + }, + { + "name_zh": "广东", + "name_en": "Guangdong", + "role": "Pearl River Delta export logistics and aviation gateway" + }, + { + "name_zh": "浙江", + "name_en": "Zhejiang", + "role": "Ningbo-Zhoushan port and e-commerce logistics hub" + }, + { + "name_zh": "江苏", + "name_en": "Jiangsu", + "role": "manufacturing-export logistics and Yangtze River port cluster" + }, + { + "name_zh": "北京", + "name_en": "Beijing", + "role": "national railway and aviation network hub" + } + ], + "firms": [ + { + "name_zh": "中远海运控股股份有限公司", + "name_en": "COSCO SHIPPING Holdings", + "ticker": "601919.SS", + "ownership": "SOE" + }, + { + "name_zh": "顺丰控股股份有限公司", + "name_en": "SF Holding", + "ticker": "002352.SZ", + "ownership": "private" + }, + { + "name_zh": "中国铁路工程集团有限公司", + "name_en": "China Railway Engineering Group", + "ticker": "601390.SS", + "ownership": "SOE" + } + ], + "region": "national" + } + } +} \ No newline at end of file diff --git a/config/cn_hf_sources.json b/config/cn_hf_sources.json new file mode 100644 index 0000000..50755f5 --- /dev/null +++ b/config/cn_hf_sources.json @@ -0,0 +1,279 @@ +{ + "sources": [ + { + "key": "autos_cpca_retail", + "name_zh": "乘联会乘用车零售销量", + "name_en": "CPCA China Passenger Vehicle Retail Sales", + "url": "https://www.cpcaauto.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "autos", + "difficulty": "hard", + "note": "Monthly passenger-vehicle retail sales for China published by the China Passenger Car Association (CPCA / 乘联会). The association releases the figures via website articles, PDF reports and its WeChat public account; there is no stable open JSON or CSV endpoint. Extracting a clean monthly time-series requires scraping or parsing the published reports. Marked todo until a scraper is implemented." + }, + { + "key": "autos_cpca_wholesale", + "name_zh": "乘联会乘用车批发销量", + "name_en": "CPCA China Passenger Vehicle Wholesale Sales", + "url": "https://www.cpcaauto.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "autos", + "difficulty": "hard", + "note": "Monthly passenger-vehicle wholesale sales for China published by the China Passenger Car Association (CPCA / 乘联会). The association releases the figures via website articles, PDF reports and its WeChat public account; there is no stable open JSON or CSV endpoint. Extracting a clean monthly time-series requires scraping or parsing the published reports. Marked todo until a scraper is implemented." + }, + { + "key": "cement_digital", + "name_zh": "数字水泥网水泥价格指数", + "name_en": "Digital Cement Network Cement Price Index", + "url": "http://www.dcement.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "cement", + "difficulty": "hard", + "note": "Published by Digital Cement (数字水泥网). No documented open API; index values appear on public pages and require screen scraping or API discovery, often behind anti-bot protection. Marked todo until a stable endpoint or scrape parser is confirmed." + }, + { + "key": "coal_cctd", + "name_zh": "CCTD环渤海动力煤价格指数", + "name_en": "CCTD Bohai-Rim Thermal Coal Price Index", + "url": "http://www.cctdcoal.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "coal", + "difficulty": "hard", + "note": "Official CCTD daily 5,500/5,000/4,500 kcal Bohai-rim thermal coal reference prices (published on working days). No public JSON/API endpoint found; content is Chinese-only and requires scraping, so marked as TODO." + }, + { + "key": "coal_power_consumption", + "name_zh": "火电煤炭消费量", + "name_en": "Thermal coal consumption", + "url": "https://www.cctd.com.cn/", + "access_method": "todo", + "frequency": "daily", + "sector": "coal", + "difficulty": "hard", + "note": "No documented public API for daily coal burn at Chinese power plants; CCTD and provincial grid data are published as HTML tables/PDFs that require scraping or manual extraction." + }, + { + "key": "bdi", + "name_zh": "波罗的海干散货指数", + "name_en": "Baltic Dry Index", + "url": "https://www.balticexchange.com/en/data-services/market-information0/dry-services.html", + "access_method": "todo", + "frequency": "daily", + "sector": "transport_logistics", + "difficulty": "hard", + "note": "Daily composite dry-bulk freight index published by the Baltic Exchange in London. The authoritative time-series is subscription-only; public aggregator pages (e.g. Trading Economics, investing.com, Macrotrends) expose current and historical values but require scraping and are subject to anti-bot/ToS limits. Marked todo until a stable open endpoint or scraper is implemented." + }, + { + "key": "ccfi", + "name_zh": "中国出口集装箱运价指数", + "name_en": "China Containerized Freight Index (CCFI)", + "url": "https://en.sse.net.cn/currentIndex?indexName=ccfi", + "access_method": "open_json", + "frequency": "weekly", + "sector": "transport_logistics", + "difficulty": "easy", + "note": "Published weekly (Fridays) by the Shanghai Shipping Exchange. The public endpoint returns the current CCFI composite index and route sub-indices as JSON without authentication. Historical single-date queries via /singleIndex/ccfi require login, but the latest/current release is openly available." + }, + { + "key": "freight_road_logistics", + "name_zh": "中国公路物流运价指数", + "name_en": "China Road Logistics Price Index", + "url": "http://www.chinawuliu.com.cn/", + "access_method": "todo", + "frequency": "weekly", + "sector": "transport_logistics", + "difficulty": "hard", + "note": "Published jointly by the China Federation of Logistics and Purchasing (CFLP) and Guangdong Lin'an Logistics Group. The weekly/monthly road-freight price index is released as HTML press releases on chinawuliu.com.cn and WeChat; no open JSON/CSV endpoint was found. Requires HTML scraping or manual extraction." + }, + { + "key": "scfi", + "name_zh": "上海出口集装箱运价指数", + "name_en": "Shanghai Containerized Freight Index (SCFI)", + "url": "https://en.sse.net.cn/currentIndex?indexName=scfi", + "access_method": "open_json", + "frequency": "weekly", + "sector": "transport_logistics", + "difficulty": "easy", + "note": "Published weekly (Fridays) by the Shanghai Shipping Exchange. The public /currentIndex endpoint returns the latest SCFI composite index and per-route sub-indices as JSON without authentication. Historical multi-date queries via /index/mutipleIndex require login, but the current release is openly available." + }, + { + "key": "macro_caixin_pmi", + "name_zh": "财新中国通用制造业PMI", + "name_en": "Caixin China General Manufacturing PMI", + "url": "https://www.caixinglobal.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "macro", + "difficulty": "hard", + "note": "Monthly press-release articles published by Caixin Insight Group / S&P Global on Caixin Global. No open JSON/CSV API; underlying historical series is subscription-only via S&P Global (Markit). Requires HTML scraping of the latest release or manual headline extraction, and is often paywalled." + }, + { + "key": "macro_cfl_pmi", + "name_zh": "中国制造业采购经理指数", + "name_en": "China CFLP Manufacturing PMI", + "url": "http://en.chinawuliu.com.cn/cflp-pmi/", + "access_method": "todo", + "frequency": "monthly", + "sector": "macro", + "difficulty": "hard", + "note": "Published jointly by NBS and CFLP as HTML press releases; no public JSON/API endpoint. Requires scraping report pages (e.g. http://www.chinawuliu.com.cn/lhhzq/YYYYMM/DD/XXXXXX.shtml) or parsing NBS stats.gov.cn release tables." + }, + { + "key": "macro_customs", + "name_zh": "中国进出口贸易总额(海关月度)", + "name_en": "China Monthly Import and Export Trade (GACC)", + "url": "https://chinadata.live/api/v2/data/china-trade-monthly", + "access_method": "open_json", + "frequency": "monthly", + "sector": "macro", + "difficulty": "easy", + "note": "Free no-key JSON API from China Data Portal, sourced from General Administration of Customs of China (GACC) official monthly releases. Returns total trade, exports, imports and trade balance in USD millions." + }, + { + "key": "macro_nbs", + "name_zh": "国家统计局宏观数据", + "name_en": "NBS China Macroeconomic Data", + "url": "https://data.stats.gov.cn/easyquery.htm?cn=A01", + "access_method": "todo", + "frequency": "monthly", + "sector": "macro", + "difficulty": "hard", + "note": "Monthly macroeconomic releases from the National Bureau of Statistics of China (NBS / 国家统计局), including CPI, PPI, industrial production, retail sales, fixed-asset investment and surveyed urban unemployment. The NBS National Data (EasyQuery) portal can be queried via parameterized endpoints such as https://data.stats.gov.cn/easyquery.htm?m=QueryData&dbcode=hgyd&rowcode=zb&colcode=sj, but responses are JSONP/HTML, require constructing wd/dfwds parameters, and are protected by anti-bot measures with no documented open API key. Marked todo until a stable scraper or API client is implemented." + }, + { + "key": "macro_pboc_credit", + "name_zh": "人民银行信贷收支与货币供应", + "name_en": "PBOC Credit, Money Supply and Aggregate Financing", + "url": "http://www.pbc.gov.cn/en/3688240/index.html", + "access_method": "todo", + "frequency": "monthly", + "sector": "macro", + "difficulty": "hard", + "note": "Monthly money supply, sources/uses of credit funds and aggregate financing data from the People’s Bank of China (Monetary Statistics pages). No stable open JSON or CSV endpoint is available; figures are published as HTML/Excel tables that require scraping and Chinese date parsing. Marked todo until a parser is implemented." + }, + { + "key": "mobility_12306", + "name_zh": "12306铁路客运量", + "name_en": "12306 Railway Passenger Traffic", + "url": "https://www.12306.cn", + "access_method": "todo", + "frequency": "daily", + "sector": "transport_logistics", + "difficulty": "hard", + "note": "12306 has no public API. Booking/search data is only available by scraping the official site or via Ministry of Transport monthly aggregate railway passenger reports. Marked todo until a stable public endpoint or scraper is implemented." + }, + { + "key": "mobility_baidu_congestion", + "name_zh": "百度地图城市拥堵指数", + "name_en": "Baidu Maps City Congestion Index", + "url": "https://jiaotong.baidu.com/reports/", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": "Baidu Maps publishes city congestion rankings and reports on jiaotong.baidu.com, but no open bulk JSON/Csv endpoint exists. Real-time road traffic requires a Baidu Maps API key (ak) and is restricted by platform terms; implementation needs a scraper or authenticated API integration." + }, + { + "key": "mobility_baidu_migration", + "name_zh": "百度迁徙", + "name_en": "Baidu Migration Index (Qianxi)", + "url": "http://huiyan.baidu.com/migration/cityrank.jsonp", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": "Undocumented JSONP endpoints on huiyan.baidu.com (cityrank/provincerank/historycurve/lastdate) expose daily move-in/move-out rankings and migration-scale indices by region ID. No API key is required, but requests need correct region codes, JSONP stripping, and geo-blocking/cookie handling. Marked TODO until a robust scraper/collector is implemented." + }, + { + "key": "mobility_box_office", + "name_zh": "出行与电影票房", + "name_en": "Mobility & Box Office", + "url": "https://piaofang.maoyan.com/dashboard", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": "Composite high-frequency proxy for Chinese consumer mobility and discretionary services. Maoyan dashboard loads real-time box-office via https://piaofang.maoyan.com/dashboard-ajax/movie (JSON) but requires dynamic signatures, timestamp, cookie rotation and Referer headers. Mobility proxies (Baidu Qianxi migration index, Amap city congestion index) are also scrape-only and periodically change obfuscation. Marked todo until a stable, lawful access path is implemented." + }, + { + "key": "mobility_civil_aviation", + "name_zh": "中国民航月度主要生产指标统计", + "name_en": "CAAC Monthly Civil Aviation KPIs", + "url": "https://www.caac.gov.cn/XXGK/XXGK/TJSJ/TJSJ_1/", + "access_method": "todo", + "frequency": "monthly", + "sector": "mobility", + "difficulty": "hard", + "note": "CAAC publishes monthly production indicator PDFs (passenger trips, cargo/mail, aircraft movements) on a Chinese-language index page. No open JSON/CSV or machine-readable API endpoint was found; extraction requires scraping the index, downloading PDF attachments, and parsing tables." + }, + { + "key": "mobility_gaode", + "name_zh": "高德地图城市拥堵延时指数", + "name_en": "Amap City Congestion Delay Index", + "url": "https://report.amap.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": "Amap (Gaode) publishes city congestion-delay and traffic-health dashboards at report.amap.com, updated daily/real-time. No open bulk JSON/CSV endpoint was found; Amap's LBS Traffic Status API requires an authenticated key and is governed by platform terms. Implementation needs a scraper or approved API integration." + }, + { + "key": "property_cric", + "name_zh": "克尔瑞房地产数据", + "name_en": "CRIC China Real Estate Data", + "url": "https://www.cricbigdata.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "property", + "difficulty": "hard", + "note": "Commercial/proprietary real-estate data from CRIC (克尔瑞), a leading China property data and consulting provider. No public open API; indicators such as sales volume, inventory, prices, land auctions and developer rankings are behind a subscription/paywall and would require authenticated scraping or a commercial license. Marked todo until a scraper or API agreement is implemented." + }, + { + "key": "property_zhongzhi_land", + "name_zh": "中指云土地招拍挂数据", + "name_en": "CREIS / Zhongzhi China Land Auction and Transaction Data", + "url": "https://www.cih-index.com/landlist/land/", + "access_method": "todo", + "frequency": "daily", + "sector": "property", + "difficulty": "hard", + "note": "Daily land auction, supply-plan and transaction listings published by China Index Academy (CREIS / 中指云 / 中指研究院) at cih-index.com. The public portal shows search/filter pages and limited preview records, but full parcel details, historical time-series and ranked city aggregates are gated behind login/subscription or delivered through the commercial API at https://api.cih-index.com/. No stable open JSON/CSV endpoint is available, so this source is marked todo until an authenticated scraper or commercial API agreement is implemented." + }, + { + "key": "steel_100njz_construction", + "name_zh": "百年建筑网建筑钢材价格", + "name_en": "100njz Construction Steel Price", + "url": "https://jiancai.mysteel.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "steel", + "difficulty": "hard", + "note": "百年建筑网(100njz.com)建筑钢材行情由上海钢联(Mysteel)建筑钢材频道(jiancai.mysteel.com)提供,日度更新。价格数据以HTML/动态渲染为主,需登录或商业数据接口,反爬机制强。TODO:实现专用scraper或接入Mysteel商业API。" + }, + { + "key": "steel_mysteel", + "name_zh": "我的钢铁网钢材价格", + "name_en": "Mysteel China Steel Prices", + "url": "https://news.mysteel.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "steel", + "difficulty": "hard", + "note": "Daily spot steel and raw-materials prices for China published by Mysteel (我的钢铁网), including rebar, hot-rolled coil, cold-rolled coil, iron ore and coke. The public news page shows latest prices but there is no stable open JSON/API endpoint; the full historical time-series and detailed indices are behind a subscription/anti-bot wall. Marked todo until a scraper or commercial API agreement is implemented." + }, + { + "key": "yiwu_index", + "name_zh": "义乌·中国小商品指数", + "name_en": "Yiwu China Commodity Index", + "url": "https://www.ywindex.com/", + "access_method": "todo", + "frequency": "weekly", + "sector": "retail_consumer", + "difficulty": "hard", + "note": "Official Yiwu small-commodity price, prosperity and confidence indices published by the Ministry of Commerce / Yiwu Municipal Government and operated by Zhejiang China Commodity City Group. The public portal renders the data in a Nuxt SPA loaded from internal apiserver.chinagoods.com endpoints; there is no documented open JSON/API and the time-series is behind dynamic rendering and anti-bot controls. Marked TODO until a robust scraper or reverse-engineered endpoint is implemented." + } + ] +} \ No newline at end of file diff --git a/config/ddti_threat_categories.json b/config/ddti_threat_categories.json new file mode 100644 index 0000000..212383b --- /dev/null +++ b/config/ddti_threat_categories.json @@ -0,0 +1,49 @@ +{ + "_meta": { + "purpose": "Maps DDTI censored terms to PALIMPSEST analytical domains so the threat board can be grouped/colored by what part of the state vector each signal informs.", + "term_domain_by": "Kimi Code (Moonshot) — neutral taxonomy classification task it accepted.", + "fallback": "terms not listed resolve to OTHER." + }, + "domains": { + "ECONOMY": {"label": "Economy & Markets", "color": "#3ad6a0"}, + "POLITICS": {"label": "Politics & Governance", "color": "#ff7a3c"}, + "SOCIETY": {"label": "Society & Stability", "color": "#d8a657"}, + "TECHNOLOGY": {"label": "Technology", "color": "#5aa2ff"}, + "FOREIGN": {"label": "Foreign Relations", "color": "#b07cff"}, + "INFORMATION": {"label": "Media & Censorship", "color": "#ff2f2f"}, + "SAFETY": {"label": "Public Safety", "color": "#e0c64a"}, + "OTHER": {"label": "Other", "color": "#8c8678"} + }, + "term_domain": { + "Internet censorship": "INFORMATION", + "online censorship": "INFORMATION", + "social media censorship": "INFORMATION", + "foreign hostile forces": "FOREIGN", + "food safety": "SAFETY", + "censorship": "INFORMATION", + "AI": "TECHNOLOGY", + "artificial intelligence": "TECHNOLOGY", + "online public opinion": "INFORMATION", + "WeChat": "TECHNOLOGY", + "chinese abroad": "SOCIETY", + "Shanxi": "SAFETY", + "mining accidents": "SAFETY", + "Douyin": "TECHNOLOGY", + "Russia relations": "FOREIGN", + "404": "INFORMATION", + "404 Deleted Content Archive": "INFORMATION", + "Censorship Vault": "INFORMATION", + "Germany": "FOREIGN", + "LGBT rights": "SOCIETY", + "LGBTQ+": "SOCIETY", + "RSDL": "POLITICS", + "Weibo": "TECHNOLOGY", + "academic corruption": "POLITICS", + "academic fraud": "POLITICS", + "account bombing": "INFORMATION", + "activists": "SOCIETY", + "advertising": "ECONOMY", + "bloggers": "INFORMATION", + "courts": "POLITICS" + } +} diff --git a/config/sources.yaml b/config/sources.yaml index 17a68d3..896f972 100644 --- a/config/sources.yaml +++ b/config/sources.yaml @@ -192,3 +192,229 @@ sources: - "MIBOR SOFR rate" - "treasury yield" - "Fed rate decision" + + # ── DDTI feasibility probe (censorship-as-signal) ──────────── + ddti_probe: + enabled: false # enable after scripts/ddti_feasibility.py returns GO + schedule: "*/15 * * * *" # every 15 min once live + collector_class: "collectors.ddti_probe.DDTIProbeCollector" + config: + deletion_feeds: + - name: cdt_english + url: "https://chinadigitaltimes.net/feed/" + - name: cdt_minitrue + url: "https://chinadigitaltimes.net/china/minitrue/feed/" + - name: cdt_chinese + url: "https://chinadigitaltimes.net/chinese/feed/" + + # ── China Social (Weibo hot search) ────────────────────────── + weibo_hotsearch: + enabled: true + schedule: "*/10 * * * *" # Every 10 minutes (hot list churns fast) + collector_class: "collectors.weibo_hotsearch.WeiboHotSearchCollector" + config: + # Used by _is_financially_relevant() — extend with your domain terms. + finance_keywords: + - "A股" # A-shares + - "股市" # stock market + - "央行" # central bank (PBOC) + - "降准" # RRR cut + - "降息" # rate cut + - "加息" # rate hike + - "人民币" # RMB / yuan + - "楼市" # property market + - "房地产" # real estate + - "经济" # economy + - "通胀" # inflation + - "GDP" + + # ── China Economic Conditions Engine (CBB) ─────────────────── + comtrade_mirror: + enabled: true + schedule: "0 */6 * * *" # Every 6 hours + collector_class: "collectors.comtrade_mirror.ComtradeMirrorCollector" + config: + base_url: "https://comtradeapi.un.org/data/v1/get" + # Optional API key. Leave empty to run unauthenticated; the collector will + # also check the COMTRADE_API_KEY environment variable if this is blank. + api_key: "" + reporter_code: 156 # China + partner_code_world: 0 # World aggregate + flows: ["M", "X"] # Imports / Exports + hs_chapters: + - "84" # Nuclear reactors, boilers, machinery + - "85" # Electrical machinery + - "62" # Apparel (not knit) + - "61" # Apparel (knit) + - "72" # Iron and steel + - "73" # Articles of iron/steel + - "39" # Plastics + - "27" # Mineral fuels + - "71" # Natural/cultured pearls, precious metals + - "90" # Optical, medical, precision instruments + - "87" # Vehicles (autos) + - "94" # Furniture + - "95" # Toys, games, sports equipment + mirror_partners: + - 842 # United States + - 392 # Japan + - 276 # Germany + - 410 # South Korea + - 704 # Vietnam + - 528 # Netherlands + - 356 # India + - 826 # United Kingdom + - 702 # Singapore + - 36 # Australia + - 124 # Canada + - 458 # Malaysia + - 764 # Thailand + - 380 # Italy + - 251 # France + recent_months: 6 + timeout: 60 + rate_limit: 1.0 + + cn_indicators: + enabled: true + schedule: "0 7 * * *" # 7 AM UTC daily + collector_class: "collectors.cn_indicators.CNIndicatorsCollector" + config: + enabled_sources: + # ── Freight / logistics ───────────────────────────────── + - key: bdi + name_zh: 波罗的海干散货指数 + name_en: Baltic Dry Index + url: "https://tradingeconomics.com/commodity/baltic" + method: GET + parser: json + unit: index + sector: transport_logistics + access: todo + note: "Daily bulk-shipping proxy. Public page; requires scraper or FRED API key." + - key: ccfi + name_zh: 中国出口集装箱运价指数 + name_en: China Containerized Freight Index + url: "http://www.sse.net.cn/index/singleIndex?indexType=ccfi" + method: GET + parser: json + unit: index + sector: transport_logistics + access: scrape + note: "Shanghai Shipping Exchange; HTML table scrape." + - key: scfi + name_zh: 上海出口集装箱运价指数 + name_en: Shanghai Containerized Freight Index + url: "http://www.sse.net.cn/index/singleIndex?indexType=scfi" + method: GET + parser: json + unit: index + sector: transport_logistics + access: scrape + note: "Shanghai Shipping Exchange; HTML table scrape." + + # ── Trade hubs ────────────────────────────────────────── + - key: yiwu_index + name_zh: 义乌中国小商品指数 + name_en: Yiwu China Commodity Index + url: "http://www.ywindex.com/" + method: GET + parser: json + unit: index + sector: trade + access: todo + note: "Yiwu small-commodity wholesale benchmark; scrape or partner feed required." + + # ── Autos ─────────────────────────────────────────────── + - key: cpca_retail_pv + name_zh: 乘联会乘用车零售销量 + name_en: CPCA Passenger Vehicle Retail Sales + url: "https://www.cpauto.com.cn/" + method: GET + parser: json + unit: units + sector: automotive + access: todo + note: "CPCA publishes via WeChat/website; requires scrape or data partner." + - key: cpca_wholesale_pv + name_zh: 乘联会乘用车批发销量 + name_en: CPCA Passenger Vehicle Wholesale Sales + url: "https://www.cpauto.com.cn/" + method: GET + parser: json + unit: units + sector: automotive + access: todo + note: "CPCA publishes via WeChat/website; requires scrape or data partner." + + # ── Steel ─────────────────────────────────────────────── + - key: steel_price_index + name_zh: 中国钢材价格指数 + name_en: China Steel Price Index + url: "https://www.mysteel.com/" + method: GET + parser: json + unit: CNY/t + sector: steel + access: todo + note: "Mysteel/Myspic index is commercial; public alternative under review." + + # ── Cement ────────────────────────────────────────────── + - key: cement_price_index + name_zh: 中国水泥价格指数 + name_en: China Cement Price Index + url: "https://www.ccement.com/" + method: GET + parser: json + unit: CNY/t + sector: construction_materials + access: todo + note: "Cement price regional averages; requires scrape." + + # ── Coal / energy ─────────────────────────────────────── + - key: coal_price_index + name_zh: 中国煤炭价格指数 + name_en: China Coal Price Index + url: "https://www.cctd.com.cn/" + method: GET + parser: json + unit: CNY/t + sector: energy + access: todo + note: "Coal price benchmarks (e.g., CCI, CCTD); commercial/scrape." + + # ── Property ──────────────────────────────────────────── + - key: property_price_index + name_zh: 商品住宅销售价格指数 + name_en: China Property Price Index + url: "https://www.stats.gov.cn/tjsj/" + method: GET + parser: json + unit: index + sector: property + access: todo + note: "NBS monthly property price data; PDF/HTML tables." + + # ── Mobility ──────────────────────────────────────────── + - key: baidu_mobility + name_zh: 百度迁徙/出行指数 + name_en: Baidu Migration/Mobility Index + url: "https://qianxi.baidu.com/" + method: GET + parser: json + unit: index + sector: consumer_mobility + access: todo + note: "Public dashboards but no stable open API; scrape or API partnership." + + # ── Macro ─────────────────────────────────────────────── + - key: pboc_lpr + name_zh: 贷款市场报价利率 + name_en: PBOC Loan Prime Rate + url: "http://www.pbc.gov.cn/zhengcehuobisi/11111/index.html" + method: GET + parser: json + unit: pct + sector: macro + access: todo + note: "Monthly 1Y/5Y LPR; official PBOC site, HTML tables." diff --git a/config/zh_censorship_gazetteer.json b/config/zh_censorship_gazetteer.json new file mode 100644 index 0000000..7861404 --- /dev/null +++ b/config/zh_censorship_gazetteer.json @@ -0,0 +1,85 @@ +{ + "_meta": { + "purpose": "Chinese censorship euphemisms & deletion-trigger vocabulary for DDTI term extraction on Chinese-language deletion feeds.", + "note": "Publicly documented evasion terms (cf. CDT Grass-Mud Horse Lexicon). Used DEFENSIVELY to DETECT censored content. Authored by Claude: the China-hosted Kimi model rejected this task as high-risk — route politically sensitive China work off the in-jurisdiction model.", + "usage": "extract_terms() substring-matches the zh fields against Chinese text; en is the analyst gloss." + }, + "categories": { + "june4_tiananmen": [ + {"zh": "六四", "en": "June 4 (Tiananmen)"}, + {"zh": "八九", "en": "'89 (1989 movement)"}, + {"zh": "8964", "en": "1989-06-04 numeric"}, + {"zh": "五月三十五日", "en": "May 35th = June 4 (date evasion)"}, + {"zh": "八平方", "en": "8 squared = 8²=64 -> June 4"}, + {"zh": "坦克人", "en": "Tank Man"}, + {"zh": "VIIV", "en": "Roman 64 evasion"}, + {"zh": "烛光", "en": "candlelight (vigil)"}, + {"zh": "广场", "en": "the Square (Tiananmen)"} + ], + "leadership_xi": [ + {"zh": "维尼", "en": "Winnie the Pooh = Xi Jinping"}, + {"zh": "小熊维尼", "en": "Winnie the Pooh = Xi"}, + {"zh": "包子", "en": "Baozi/steamed bun = Xi"}, + {"zh": "习包子", "en": "Xi-baozi"}, + {"zh": "总加速师", "en": "Accelerator-in-Chief (sarcastic, Xi)"}, + {"zh": "一尊", "en": "the Venerated One (Xi)"}, + {"zh": "庆丰帝", "en": "Qingfeng Emperor (Xi)"}, + {"zh": "登基", "en": "enthronement (re leadership term)"}, + {"zh": "称帝", "en": "declaring emperor"} + ], + "protest_dissent": [ + {"zh": "白纸", "en": "blank paper (A4 protest)"}, + {"zh": "白纸革命", "en": "White Paper / A4 Revolution"}, + {"zh": "散步", "en": "taking a walk = protest evasion"}, + {"zh": "围观", "en": "onlooking/gathering"}, + {"zh": "上访", "en": "petitioning authorities"}, + {"zh": "维权", "en": "rights defense"}, + {"zh": "群体性事件", "en": "mass incident (official euphemism for protest)"}, + {"zh": "罢工", "en": "strike"}, + {"zh": "声援", "en": "voicing support/solidarity"} + ], + "economic_distress": [ + {"zh": "躺平", "en": "lying flat (work refusal)"}, + {"zh": "内卷", "en": "involution (ruinous competition)"}, + {"zh": "鬼城", "en": "ghost city"}, + {"zh": "烂尾楼", "en": "unfinished/abandoned buildings"}, + {"zh": "断供", "en": "mortgage payment halt / strike"}, + {"zh": "暴雷", "en": "financial blowup / default"}, + {"zh": "挤兑", "en": "bank run"}, + {"zh": "提款难", "en": "withdrawal difficulty (frozen deposits)"}, + {"zh": "失业潮", "en": "wave of unemployment"}, + {"zh": "经济下行", "en": "economic downturn"} + ], + "emigration_run": [ + {"zh": "润", "en": "run = emigrate (pun on run/rùn)"}, + {"zh": "润学", "en": "runology (study of emigrating)"}, + {"zh": "走线", "en": "'the route' = irregular migration (often via Latin America)"}, + {"zh": "走线客", "en": "route-takers (irregular migrants)"}, + {"zh": "肉身翻墙", "en": "physically scaling the wall (fleeing)"}, + {"zh": "偷渡", "en": "clandestine border crossing"}, + {"zh": "移民潮", "en": "emigration wave"} + ], + "censorship_meta": [ + {"zh": "河蟹", "en": "river crab = 'harmony' = censored"}, + {"zh": "和谐", "en": "to harmonize = to censor"}, + {"zh": "翻墙", "en": "scale the wall = use a VPN"}, + {"zh": "删帖", "en": "delete posts"}, + {"zh": "屏蔽", "en": "block/shield"}, + {"zh": "敏感词", "en": "sensitive word"}, + {"zh": "404", "en": "deleted / not found"}, + {"zh": "五毛", "en": "50-cent army (paid commenters)"}, + {"zh": "境外势力", "en": "foreign forces (deflection trope)"} + ], + "repression_triggers": [ + {"zh": "维稳", "en": "stability maintenance"}, + {"zh": "寻衅滋事", "en": "'picking quarrels' (catch-all charge)"}, + {"zh": "颜色革命", "en": "color revolution (regime threat framing)"}, + {"zh": "煽动颠覆", "en": "inciting subversion"}, + {"zh": "跨省", "en": "cross-province (police pursuit)"}, + {"zh": "喝茶", "en": "'drink tea' = police summons/interrogation"}, + {"zh": "失联", "en": "lost contact = disappeared"}, + {"zh": "被自杀", "en": "'suicided' (suspicious death)"}, + {"zh": "双规", "en": "shuanggui (Party detention)"} + ] + } +} diff --git a/config/zh_finance_lexicon.json b/config/zh_finance_lexicon.json new file mode 100644 index 0000000..cbd2fa9 --- /dev/null +++ b/config/zh_finance_lexicon.json @@ -0,0 +1,23 @@ +{ + "_meta": { + "purpose": "Chinese financial lexicon for Weibo hot-search relevance filtering and sentiment enrichment.", + "generated_by": "Kimi Code (Moonshot kimi-for-coding) — delegated for native Chinese-language domain knowledge", + "consumed_by": [ + "collectors/weibo_hotsearch.py :: _is_financially_relevant", + "processors/sentiment.py :: Chinese hawkish/dovish/sector detection (pending)" + ], + "note": "denylist OVERRIDES finance_keywords (excludes false positives like 经济适用男)." + }, + "finance_keywords": ["A股", "港股", "美股", "央行", "降准", "降息", "加息", "人民币", "楼市", "通胀", "牛市", "熊市", "股灾", "IPO", "沪指", "深指", "创业板", "科创板", "北交所", "港股通", "美联储", "汇率", "黄金", "原油", "比特币", "区块链", "基金", "理财", "保险", "银行", "房地产", "国债", "地方债", "社融", "M2", "CPI", "PPI", "GDP", "上市公司", "财报", "券商", "证监会", "金融监管"], + "denylist": ["经济适用男", "经济适用女", "房车", "房车生活", "房车旅行", "房车改装", "零花钱", "压岁钱", "钱包", "省钱攻略", "钱多多", "富二代", "房奴"], + "hawkish_keywords": ["加息", "加息周期", "收紧", "紧缩", "缩表", "上调利率", "提高利率", "收紧货币", "货币紧缩", "流动性收紧", "鹰派", "利率上行", "控制通胀"], + "dovish_keywords": ["降息", "降准", "宽松", "量化宽松", "刺激", "放水", "释放流动性", "下调利率", "降低利率", "货币宽松", "鸽派", "降准降息", "扩张性政策"], + "sector_keywords": { + "banking": ["银行", "银行股", "商业银行", "国有银行", "不良贷款", "净息差", "存款", "贷款", "信用卡"], + "markets": ["股市", "大盘", "指数", "上证指数", "深证成指", "创业板", "科创板", "恒生指数", "纳斯达克", "标普500", "成交量", "涨停", "跌停"], + "real_estate": ["房地产", "楼市", "房价", "房企", "商品房", "二手房", "限购", "房贷", "土地市场", "房产税"], + "commodities": ["黄金", "原油", "铜", "铁矿石", "大宗商品", "农产品", "有色金属", "白银", "天然气", "油价"], + "forex": ["人民币", "美元", "欧元", "日元", "汇率", "人民币兑美元", "外汇储备", "离岸人民币", "美元指数", "汇市"], + "tech": ["科技", "科技股", "半导体", "芯片", "人工智能", "AI", "新能源", "电动车", "互联网", "中概股", "华为", "宁德时代", "腾讯", "阿里巴巴"] + } +} diff --git a/config/zh_market_modifiers.json b/config/zh_market_modifiers.json new file mode 100644 index 0000000..4e70838 --- /dev/null +++ b/config/zh_market_modifiers.json @@ -0,0 +1,12 @@ +{ + "_meta": { + "purpose": "Chinese market intensity/negation/direction modifiers for sentiment refinement.", + "generated_by": "Kimi Code (Moonshot) — neutral Chinese-finance task it accepted (cf. it REJECTED the censorship-lexicon task as high-risk; sensitive China work is routed off the in-jurisdiction model).", + "consumed_by": "processors/zh_finance.py (negators) + candidate finance_keywords" + }, + "intensity_amplifiers": [{"zh":"暴","en":"sharply/violently"},{"zh":"大幅","en":"significantly/largely"},{"zh":"强劲","en":"strongly"},{"zh":"猛烈","en":"fiercely"},{"zh":"急剧","en":"sharply/rapidly"},{"zh":"显著","en":"markedly"},{"zh":"迅猛","en":"swiftly and fiercely"},{"zh":"疯狂","en":"frantically"},{"zh":"极速","en":"extremely fast"},{"zh":"大规模","en":"large-scale"}], + "intensity_diminishers": [{"zh":"微","en":"slightly"},{"zh":"小幅","en":"slightly/small-scale"},{"zh":"略有","en":"slightly"},{"zh":"温和","en":"moderately"},{"zh":"有限","en":"limitedly"},{"zh":"稍显","en":"somewhat"},{"zh":"缓慢","en":"slowly"},{"zh":"逐步","en":"gradually"},{"zh":"局部","en":"partially"},{"zh":"轻微","en":"mildly"}], + "additional_negators": [{"zh":"尚未","en":"not yet"},{"zh":"未","en":"not"},{"zh":"不再","en":"no longer"},{"zh":"缺乏","en":"lack"},{"zh":"无缘","en":"miss out/not entitled"},{"zh":"难","en":"hard to"},{"zh":"并无","en":"not actually"},{"zh":"并未","en":"did not"},{"zh":"拒绝","en":"refuse"},{"zh":"排除","en":"exclude"}], + "market_distress_terms": [{"zh":"重挫","en":"plunge"},{"zh":"暴跌","en":"plummet"},{"zh":"崩盘","en":"crash/meltdown"},{"zh":"跳水","en":"dive"},{"zh":"杀跌","en":"panic selling"},{"zh":"利空","en":"negative news/bearish"},{"zh":"熊市","en":"bear market"},{"zh":"下行","en":"downward trend"},{"zh":"失守","en":"fall below"},{"zh":"疲软","en":"weakness"},{"zh":"暴跌潮","en":"wave of crashes"},{"zh":"熔断","en":"circuit breaker"}], + "market_positive_terms": [{"zh":"反弹","en":"rebound"},{"zh":"大涨","en":"surge"},{"zh":"牛市","en":"bull market"},{"zh":"上行","en":"upward trend"},{"zh":"突破","en":"break through"},{"zh":"拉升","en":"pull up/rocket"},{"zh":"利好","en":"positive news/bullish"},{"zh":"强劲","en":"strong/robust"},{"zh":"回暖","en":"warm up/recover"},{"zh":"冲高","en":"rush higher"},{"zh":"放量上涨","en":"rise on increased volume"},{"zh":"企稳","en":"stabilize and recover"}] +} diff --git a/core/tasks.py b/core/tasks.py index f41f269..7e83f7b 100644 --- a/core/tasks.py +++ b/core/tasks.py @@ -701,6 +701,21 @@ def generate_digest(): return {"error": str(e)} +@app.task +def generate_ddti_index(): + """Recompute the DDTI selectivity/novelty index from ddti_deletion articles. + + Schedule alongside collect-ddti_probe (e.g. every 15-30 min) in scheduler beat. + Writes ddti:index:latest to Redis; pushes high-threat/new terms to alerts:ddti. + """ + try: + from processors.ddti_index import DDTIIndexProcessor + return DDTIIndexProcessor().run() + except Exception as e: + logger.error(f"DDTI index failed: {e}") + return {"error": str(e)} + + # ══════════════════════════════════════════════════════════════ # 4. ROUTING — Push collected data to DragonScope + LiquiFi # ══════════════════════════════════════════════════════════════ diff --git a/dashboards/conditions_dashboard.html b/dashboards/conditions_dashboard.html new file mode 100644 index 0000000..2fa5eef --- /dev/null +++ b/dashboards/conditions_dashboard.html @@ -0,0 +1,318 @@ + + + + + +CBB Engine · China Economic Conditions + + + + + + +
+
+
+
CBB.ENGINE
+
China Economic Conditions · Sector × Region Diffusion Index
+
+
+
Index · GO
+
UTC
+
+
+ +
+ SAMPLE DATA + demonstration feed — connect the API for live signal +
+ +
+ +
+
01Conditions Heatmap — blended diffusion by sector and region
+
+ + + + + + + + + + + + + + + + +
SectorRegionPeriodDiffusion DSentiment SDAnchor ASMomentumMirror GapConfidenceMentions
+
+ +
+ Deteriorating (D ≤ −15) + Neutral (−15 < D < 15) + Improving (D ≥ 15) + Momentum: up flat down +
+
+ +
+
SCOPE — sector-level nowcasting from reported trade, partner-reported mirror trade, high-frequency indicators, and sentiment mentions.
+
SOURCE: UN Comtrade mirror + configured China high-frequency feeds + NLP sentiment pipeline · ↻ refresh
+
+
+ + + + diff --git a/dashboards/ddti_dashboard.html b/dashboards/ddti_dashboard.html new file mode 100644 index 0000000..41f20c6 --- /dev/null +++ b/dashboards/ddti_dashboard.html @@ -0,0 +1,386 @@ + + + + + +PALIMPSEST · DDTI — Censor Attention Monitor + + + + + + +
+
+
+
PALIMPSEST.DDTI
+
Deletion-Differential Threat Index · Censor Attention Monitor
+
+
+
Selectivity · GO
+
Velocity · Blocked (egress)
+
UTC
+
+
+ +
+ SAMPLE DATA + demonstration feed — connect the API for live signal +
+ +
+ +
+
+
01Threat Board — ranked by censor attention × novelty
+
+
+ +
+ +
+
SCOPE — censor attention allocation (numerator-only; not a true deletion rate).
+
SOURCE: China Digital Times 404 archive + Minitrue · velocity sub-signal pending in-China egress · ↻ refresh
+
+
+ + + + + diff --git a/processors/conditions_index.py b/processors/conditions_index.py new file mode 100644 index 0000000..a77ec27 --- /dev/null +++ b/processors/conditions_index.py @@ -0,0 +1,531 @@ +"""China Economic Conditions Engine — conditions index processor. + +Implements a pure, offline-testable diffusion index that blends three +independent signals for each sector/month: + +1. Reported trade value (China customs / UN Comtrade reporter==156). +2. Mirror trade value (partner-reported flows involving China). +3. Chinese high-frequency indicators (BDI, CCFI, SCFI, etc.). +4. Sentiment diffusion from news/social mentions. + +The public core is `compute_conditions(...)`. The `ConditionsIndexProcessor` +wraps it as a Celery-task-style aggregate processor that reads from PostgreSQL +and publishes the latest result to Redis (`cbb:latest`). +""" + +import json +import logging +import math +import os +from collections import defaultdict +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Optional + +from core.base_processor import BaseProcessor + +logger = logging.getLogger(__name__) + +_TAXONOMY_PATH = Path(__file__).resolve().parent.parent / "config" / "cbb_taxonomy.json" + +# Tunable index parameters +ANCHOR_TANH_SCALE = 0.10 # growth rate that maps to ~76% of saturation +SENTIMENT_POS_THRESHOLD = 0.15 +SENTIMENT_NEG_THRESHOLD = -0.15 +TRADE_WEIGHT = 0.6 +SENTIMENT_WEIGHT = 0.4 +HIGH_CONFIDENCE_MENTIONS = 30 +MED_CONFIDENCE_MENTIONS = 10 + + +def _norm_dt(dt: Optional[datetime]) -> Optional[datetime]: + """Normalize a datetime to UTC; treat naive datetimes as UTC.""" + if dt is None: + return None + if dt.tzinfo is None: + return dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + + +def _month_key(dt: datetime) -> tuple: + dt = _norm_dt(dt) + return (dt.year, dt.month) + + +def _month_start(year: int, month: int) -> datetime: + return datetime(year, month, 1, tzinfo=timezone.utc) + + +def _prev_month(year: int, month: int) -> tuple: + if month == 1: + return (year - 1, 12) + return (year, month - 1) + + +def _is_complete_month(year: int, month: int, now: datetime) -> bool: + """True when the last instant of ``year/month`` is at or before ``now``.""" + now = _norm_dt(now) + if month == 12: + end = datetime(year + 1, 1, 1, tzinfo=timezone.utc) - timedelta(seconds=1) + else: + end = datetime(year, month + 1, 1, tzinfo=timezone.utc) - timedelta(seconds=1) + return end <= now + + +def _latest_complete_month(now: datetime) -> tuple: + """Calendar-based latest complete month at or before ``now``.""" + now = _norm_dt(now) + y, m = now.year, now.month + if _is_complete_month(y, m, now): + return (y, m) + return _prev_month(y, m) + + +def _period_str(year: int, month: int) -> str: + return f"{year:04d}-{month:02d}" + + +def _load_taxonomy(path: Optional[Path] = None) -> dict: + """Load CBB taxonomy from disk; return an empty skeleton on failure.""" + path = path or _TAXONOMY_PATH + try: + return json.loads(path.read_text(encoding="utf-8")) + except Exception as e: + logger.warning(f"[ConditionsIndex] taxonomy load failed: {e}") + return {"sectors": {}} + + +def compute_conditions(trade_series, cn_indicators, sentiment_mentions, taxonomy, now): + """Pure computation core for the China Economic Conditions index. + + Parameters + ---------- + trade_series: + List of dicts with keys ``date``, ``flow`` (M|X), ``hs``, + ``value``, ``reporter``, ``partner``. + cn_indicators: + List of dicts with keys ``date``, ``indicator``, ``value``. + sentiment_mentions: + List of dicts with keys ``date``, ``sector``, ``score``. + taxonomy: + Dict loaded from ``config/cbb_taxonomy.json``. + now: + Datetime anchor. The latest complete month ≤ ``now`` and the previous + month are used for momentum. + + Returns + ------- + List of per-sector result dicts (see spec for field definitions). + """ + sectors = taxonomy.get("sectors", {}) + if not sectors: + return [] + + now = _norm_dt(now) + latest = _latest_complete_month(now) + previous = _prev_month(*latest) + + # Map HS codes → sectors (a code may belong to multiple sectors). + hs_to_sectors: dict[str, list[str]] = defaultdict(list) + for sector_key, sector in sectors.items(): + for hs in sector.get("hs_codes", []): + hs_to_sectors[str(hs)].append(sector_key) + + # Aggregate reported / mirror trade by (sector, month). + reported: dict[str, dict[tuple, float]] = defaultdict(lambda: defaultdict(float)) + mirror: dict[str, dict[tuple, float]] = defaultdict(lambda: defaultdict(float)) + for rec in trade_series: + dt = _norm_dt(rec.get("date")) + if dt is None: + continue + m = _month_key(dt) + hs = str(rec.get("hs", "")) + val = rec.get("value") + if val is None: + continue + try: + val = float(val) + except (TypeError, ValueError): + continue + reporter = rec.get("reporter") + partner = rec.get("partner") + for sector in hs_to_sectors.get(hs, []): + if str(reporter) == "156": + reported[sector][m] += val + if str(partner) == "156": + mirror[sector][m] += val + + # Aggregate CN high-frequency indicators by (month, indicator key). + ind_by_month: dict[tuple, dict[str, float]] = {} + for i in cn_indicators: + dt = _norm_dt(i.get("date")) + if dt is None: + continue + m = _month_key(dt) + key = i.get("indicator", "") + val = i.get("value") + if val is None or not key: + continue + try: + val = float(val) + except (TypeError, ValueError): + continue + inner = ind_by_month.setdefault(m, {}) + inner[key] = inner.get(key, 0.0) + val + + # Aggregate sentiment mentions by (sector, month). + mentions_by_sector_month: dict[str, dict[tuple, list[float]]] = defaultdict( + lambda: defaultdict(list) + ) + for mention in sentiment_mentions: + dt = _norm_dt(mention.get("date")) + if dt is None: + continue + m = _month_key(dt) + sector = mention.get("sector", "") + score = mention.get("score", 0.0) + try: + score = float(score) + except (TypeError, ValueError): + score = 0.0 + mentions_by_sector_month[sector][m].append(score) + + results = [] + for sector_key in sorted(sectors.keys()): + sector = sectors[sector_key] + region = sector.get("region", "national") + hf_sources = sector.get("cn_hf_sources", []) or [] + + month_metrics = {} + for m in (latest, previous): + prev_m = _prev_month(*m) + + # --- Anchor: reported trade first, then CN HF fallback --- + g = None + anchor_source = None + if m in reported[sector_key] and prev_m in reported[sector_key]: + cur_val = reported[sector_key][m] + prev_val = reported[sector_key][prev_m] + g = (cur_val - prev_val) / max(1.0, abs(prev_val)) + anchor_source = "trade" + else: + cur_ind = ind_by_month.get(m, {}) + prev_ind = ind_by_month.get(prev_m, {}) + for src in hf_sources: + if src in cur_ind and src in prev_ind: + cur_val = cur_ind[src] + prev_val = prev_ind[src] + g = (cur_val - prev_val) / max(1.0, abs(prev_val)) + anchor_source = f"cn_hf:{src}" + break + + anchor_available = g is not None + as_value = 100.0 * math.tanh(g / ANCHOR_TANH_SCALE) if anchor_available else 0.0 + + # --- Sentiment diffusion --- + scores = mentions_by_sector_month.get(sector_key, {}).get(m, []) + pos = sum(1 for s in scores if s > SENTIMENT_POS_THRESHOLD) + neg = sum(1 for s in scores if s < SENTIMENT_NEG_THRESHOLD) + neutral = len(scores) - pos - neg + sd = 100.0 * (pos - neg) / max(1, pos + neg + neutral) + + # --- Blended diffusion --- + if anchor_available: + d = SENTIMENT_WEIGHT * sd + TRADE_WEIGHT * as_value + else: + d = float(sd) + + month_metrics[m] = { + "D": d, + "SD": sd, + "AS": as_value, + "reported_value": reported[sector_key].get(m), + "mirror_value": mirror[sector_key].get(m), + "anchor_growth": g, + "anchor_source": anchor_source, + "pos": pos, + "neg": neg, + "neutral": neutral, + } + + latest_metrics = month_metrics[latest] + prev_metrics = month_metrics.get(previous, {}) + d_latest = latest_metrics["D"] + sd_latest = latest_metrics["SD"] + as_latest = latest_metrics["AS"] + momentum = d_latest - prev_metrics.get("D", 0.0) if previous in month_metrics else 0.0 + n_mentions = latest_metrics["pos"] + latest_metrics["neg"] + latest_metrics["neutral"] + + # --- Mirror gap --- + rpt = latest_metrics["reported_value"] + mir = latest_metrics["mirror_value"] + if rpt is not None and mir is not None: + mirror_gap = 100.0 * (mir - rpt) / max(1.0, abs(rpt)) + else: + mirror_gap = None + + # --- Confidence --- + anchor_available = latest_metrics["anchor_growth"] is not None + if n_mentions >= HIGH_CONFIDENCE_MENTIONS and anchor_available: + confidence = "high" + elif n_mentions >= MED_CONFIDENCE_MENTIONS or anchor_available: + confidence = "med" + else: + confidence = "low" + + results.append({ + "sector": sector_key, + "region": region, + "period": _period_str(*latest), + "D": round(d_latest, 4), + "SD": round(sd_latest, 4), + "AS": round(as_latest, 4), + "momentum": round(momentum, 4), + "mirror_gap": round(mirror_gap, 4) if mirror_gap is not None else None, + "confidence": confidence, + "n_mentions": n_mentions, + "inputs": { + "reported_value": latest_metrics["reported_value"], + "mirror_value": latest_metrics["mirror_value"], + "anchor_growth": latest_metrics["anchor_growth"], + "anchor_source": latest_metrics["anchor_source"], + "pos": latest_metrics["pos"], + "neg": latest_metrics["neg"], + "neutral": latest_metrics["neutral"], + }, + }) + + return results + + +def _build_inputs_from_db(now: datetime): + """Query PostgreSQL and build the in-memory inputs for ``compute_conditions``. + + Returns a tuple ``(trade_series, cn_indicators, sentiment_mentions, taxonomy)``. + Each list may be empty on database or parsing errors. + """ + trade_series = [] + cn_indicators = [] + sentiment_mentions = [] + + try: + from api.database import SessionLocal + from storage.models import EconomicData, SentimentScore + except Exception as e: + logger.warning(f"[ConditionsIndex] DB imports unavailable: {e}") + return trade_series, cn_indicators, sentiment_mentions, _load_taxonomy() + + db = SessionLocal() + try: + econ_cutoff = now - timedelta(days=90) + econ_rows = ( + db.query(EconomicData) + .filter( + EconomicData.source.in_(["comtrade_mirror", "cn_indicators"]), + EconomicData.collected_at >= econ_cutoff, + ) + .all() + ) + for row in econ_rows: + try: + if row.source == "comtrade_mirror": + # indicator format: trade_{flow}_{hs} + parts = (row.indicator or "").split("_") + if len(parts) == 3 and parts[0] == "trade": + _, flow, hs = parts + else: + continue + meta = row.extra_data or {} + trade_series.append({ + "date": row.date, + "flow": flow, + "hs": hs, + "value": float(row.value) if row.value is not None else None, + "reporter": meta.get("reporter"), + "partner": meta.get("partner"), + "net_weight": meta.get("netWeight"), + }) + elif row.source == "cn_indicators": + cn_indicators.append({ + "date": row.date, + "indicator": row.indicator, + "value": float(row.value) if row.value is not None else None, + }) + except Exception as e: + logger.warning(f"[ConditionsIndex] skipping row {row.id}: {e}") + continue + + sent_cutoff = now - timedelta(days=30) + sent_rows = ( + db.query(SentimentScore) + .filter(SentimentScore.created_at >= sent_cutoff) + .all() + ) + for row in sent_rows: + sector_scores = row.sector_scores or {} + for sector_key in sector_scores.keys(): + sentiment_mentions.append({ + "date": row.created_at, + "sector": sector_key, + "score": float(row.overall) if row.overall is not None else 0.0, + }) + except Exception as e: + logger.error(f"[ConditionsIndex] DB query failed: {e}") + finally: + db.close() + + return trade_series, cn_indicators, sentiment_mentions, _load_taxonomy() + + +class ConditionsIndexProcessor(BaseProcessor): + """Aggregate processor: DB inputs → conditions index → Redis + snapshot rows.""" + + name = "conditions_index" + + def process_one(self, article: dict) -> dict: + return {"status": "use_run"} + + def run(self) -> dict: + now = datetime.now(timezone.utc) + trade_series, cn_indicators, sentiment_mentions, taxonomy = _build_inputs_from_db(now) + + try: + results = compute_conditions( + trade_series, cn_indicators, sentiment_mentions, taxonomy, now + ) + except Exception as e: + logger.error(f"[ConditionsIndex] compute failed: {e}") + return {"status": "error", "error": str(e)} + + # Publish to Redis + try: + import redis + payload = { + "generated_at": now.isoformat(), + "period": _period_str(*_latest_complete_month(now)), + "sectors": results, + } + r = redis.from_url( + os.getenv("REDIS_URL", "redis://localhost:6379"), decode_responses=True + ) + r.set("cbb:latest", json.dumps(payload, ensure_ascii=False), ex=7200) + r.close() + except Exception as e: + logger.warning(f"[ConditionsIndex] Redis publish failed: {e}") + + # Persist snapshots + try: + from api.database import SessionLocal + from storage.models import ConditionsIndexSnapshot + + db = SessionLocal() + try: + for res in results: + snap = ConditionsIndexSnapshot( + generated_at=now, + period=res.get("period"), + sector=res.get("sector"), + region=res.get("region"), + diffusion=res.get("D", 0.0), + sentiment=res.get("SD", 0.0), + anchor=res.get("AS", 0.0), + momentum=res.get("momentum", 0.0), + mirror_gap=res.get("mirror_gap"), + confidence=res.get("confidence", "low"), + n_mentions=res.get("n_mentions", 0), + inputs=res.get("inputs", {}), + ) + db.add(snap) + db.commit() + finally: + db.close() + except Exception as e: + logger.warning(f"[ConditionsIndex] snapshot persist failed: {e}") + + logger.info(f"[ConditionsIndex] computed {len(results)} sectors") + return { + "status": "success", + "sectors": len(results), + "period": _period_str(*_latest_complete_month(now)), + "generated_at": now.isoformat(), + } + + +if __name__ == "__main__": + # Offline self-test: 2–3 synthetic months across electronics, autos, steel. + taxonomy = { + "sectors": { + "electronics": { + "hs_codes": ["85"], + "cn_hf_sources": ["bdi"], + "region": "coastal_export", + }, + "autos": { + "hs_codes": ["87"], + "cn_hf_sources": ["ccfi"], + "region": "coastal_export", + }, + "steel": { + "hs_codes": ["72"], + "cn_hf_sources": [], + "region": "northeast", + }, + } + } + + base = datetime(2024, 6, 1, tzinfo=timezone.utc) # latest complete month = May 2024 + + trade_series = [ + # March 2024 baseline + {"date": datetime(2024, 3, 1, tzinfo=timezone.utc), "flow": "X", "hs": "85", "value": 850.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 3, 1, tzinfo=timezone.utc), "flow": "X", "hs": "87", "value": 460.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 3, 1, tzinfo=timezone.utc), "flow": "X", "hs": "72", "value": 320.0, "reporter": 156, "partner": 0}, + # April 2024 previous month + {"date": datetime(2024, 4, 1, tzinfo=timezone.utc), "flow": "X", "hs": "85", "value": 900.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 4, 1, tzinfo=timezone.utc), "flow": "X", "hs": "87", "value": 480.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 4, 1, tzinfo=timezone.utc), "flow": "X", "hs": "72", "value": 310.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 4, 1, tzinfo=timezone.utc), "flow": "M", "hs": "85", "value": 920.0, "reporter": 0, "partner": 156}, + # May 2024 latest month + {"date": datetime(2024, 5, 1, tzinfo=timezone.utc), "flow": "X", "hs": "85", "value": 1000.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 5, 1, tzinfo=timezone.utc), "flow": "X", "hs": "87", "value": 500.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 5, 1, tzinfo=timezone.utc), "flow": "X", "hs": "72", "value": 300.0, "reporter": 156, "partner": 0}, + {"date": datetime(2024, 5, 1, tzinfo=timezone.utc), "flow": "M", "hs": "85", "value": 1050.0, "reporter": 0, "partner": 156}, + ] + + cn_indicators = [ + {"date": datetime(2024, 4, 1, tzinfo=timezone.utc), "indicator": "bdi", "value": 1750.0}, + {"date": datetime(2024, 5, 1, tzinfo=timezone.utc), "indicator": "bdi", "value": 1800.0}, + {"date": datetime(2024, 4, 1, tzinfo=timezone.utc), "indicator": "ccfi", "value": 880.0}, + {"date": datetime(2024, 5, 1, tzinfo=timezone.utc), "indicator": "ccfi", "value": 900.0}, + ] + + sentiment_mentions = [ + # April + {"date": datetime(2024, 4, 5, tzinfo=timezone.utc), "sector": "electronics", "score": 0.20}, + {"date": datetime(2024, 4, 6, tzinfo=timezone.utc), "sector": "electronics", "score": -0.05}, + {"date": datetime(2024, 4, 7, tzinfo=timezone.utc), "sector": "autos", "score": 0.30}, + {"date": datetime(2024, 4, 8, tzinfo=timezone.utc), "sector": "steel", "score": -0.25}, + # May + {"date": datetime(2024, 5, 5, tzinfo=timezone.utc), "sector": "electronics", "score": 0.35}, + {"date": datetime(2024, 5, 6, tzinfo=timezone.utc), "sector": "electronics", "score": 0.10}, + {"date": datetime(2024, 5, 7, tzinfo=timezone.utc), "sector": "electronics", "score": -0.20}, + {"date": datetime(2024, 5, 8, tzinfo=timezone.utc), "sector": "autos", "score": 0.25}, + {"date": datetime(2024, 5, 9, tzinfo=timezone.utc), "sector": "autos", "score": 0.05}, + {"date": datetime(2024, 5, 10, tzinfo=timezone.utc), "sector": "steel", "score": -0.10}, + ] + + results = compute_conditions(trade_series, cn_indicators, sentiment_mentions, taxonomy, base) + + print("\nChina Economic Conditions Index (offline self-test)") + print("=" * 95) + print(f"{'Sector':<14} {'Region':<16} {'Period':<8} {'D':>8} {'SD':>8} {'AS':>8} {'Mom':>7} {'Gap':>8} {'Conf':>5} {'N':>4}") + print("-" * 95) + for r in results: + gap = f"{r['mirror_gap']:.1f}" if r["mirror_gap"] is not None else "-" + mom_arrow = "▲" if r["momentum"] > 0.5 else ("▼" if r["momentum"] < -0.5 else "▬") + print( + f"{r['sector']:<14} {r['region']:<16} {r['period']:<8} " + f"{r['D']:>8.2f} {r['SD']:>8.2f} {r['AS']:>8.2f} " + f"{mom_arrow} {r['momentum']:>5.2f} {gap:>8} {r['confidence']:>5} {r['n_mentions']:>4}" + ) + print("=" * 95) diff --git a/processors/conditions_report.py b/processors/conditions_report.py new file mode 100644 index 0000000..eaeaed4 --- /dev/null +++ b/processors/conditions_report.py @@ -0,0 +1,433 @@ +"""China economic conditions report generator. + +Reads the latest CBB index (Redis key `cbb:latest` or recent +`ConditionsIndexSnapshot` rows), builds a neutral briefing prompt, calls an +LLM via the project's free-llm router / Anthropic / Ollama fallback chain, and +writes the report to `data/cbb/reports/.md` plus `latest.md`. + +A lightweight metadata record is stored by reusing the existing `DailyDigest` +table; the canonical report content lives on disk so the API can serve it +without depending on Postgres. +""" + +import asyncio +import json +import logging +import os +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import httpx + +from core.base_processor import BaseProcessor + +logger = logging.getLogger(__name__) + +_REPORT_DIR = Path(__file__).resolve().parent.parent / "data" / "cbb" / "reports" +_OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434") + + +class ConditionsReportGenerator(BaseProcessor): + name = "conditions_report" + batch_size = 50 + + def __init__(self, config: dict = None): + super().__init__(config) + self.llm_model = self.config.get("llm_model", "claude-sonnet-4-6") + self.ollama_model = self.config.get("ollama_model", "llama3") + self.send_telegram = self.config.get("send_telegram", False) + + def process_one(self, article: dict) -> dict: + return {"status": "use_run"} + + def run(self) -> dict: + """Generate today's China economic conditions report.""" + from api.database import SessionLocal + from storage.models import DailyDigest + + db = SessionLocal() + try: + today = datetime.now(timezone.utc).date() + + # 1. Load latest index data. + index_data = self._load_latest_index(db) + if not index_data: + return {"status": "no_data", "date": str(today)} + + sectors = index_data.get("sectors", []) + generated_at = index_data.get("generated_at") + + # 2. Build prompt and generate report. + prompt = self._build_prompt(sectors, generated_at) + report = self._generate_report(prompt, sectors, generated_at) + + # 3. Write report files. + report_path, latest_path = self._write_report(report, today) + + # 4. Store lightweight metadata record. + digest = DailyDigest( + date=today, + summary=report, + top_themes=[{"sector": s.get("sector"), "D": s.get("D")} for s in sectors], + sentiment_summary={ + "sectors": len(sectors), + "generated_at": generated_at, + "report_path": str(report_path), + }, + key_data_releases=[ + { + "sector": s.get("sector"), + "D": s.get("D"), + "momentum": s.get("momentum"), + "confidence": s.get("confidence"), + } + for s in sectors + ], + new_circulars=[], + ) + db.add(digest) + db.commit() + + if self.send_telegram: + self._send_telegram(report) + + return { + "status": "success", + "date": str(today), + "sectors": len(sectors), + "report_path": str(report_path), + "report_length": len(report), + } + except Exception as e: + logger.error(f"[ConditionsReport] Failed: {e}") + try: + db.rollback() + except Exception: + pass + return {"status": "error", "error": str(e)} + finally: + db.close() + + def _load_latest_index(self, db) -> dict: + """Read `cbb:latest` from Redis or fall back to recent DB snapshots.""" + try: + import redis + + r = redis.from_url( + os.getenv("REDIS_URL", "redis://localhost:6379"), + decode_responses=True, + ) + raw = r.get("cbb:latest") + r.close() + if raw: + data = json.loads(raw) + if data and data.get("sectors"): + return data + except Exception as e: + logger.warning(f"[ConditionsReport] Redis read failed: {e}") + + # Fallback: query recent ConditionsIndexSnapshot rows. + try: + from storage.models import ConditionsIndexSnapshot + + cutoff = datetime.now(timezone.utc) - timedelta(hours=24) + rows = ( + db.query(ConditionsIndexSnapshot) + .filter(ConditionsIndexSnapshot.generated_at >= cutoff) + .order_by(ConditionsIndexSnapshot.generated_at.desc()) + .all() + ) + if not rows: + return {} + + # Keep the newest row per sector. + seen = {} + for row in rows: + if row.sector not in seen: + seen[row.sector] = row + + sectors = [] + generated_at = None + for row in seen.values(): + ts = row.generated_at.isoformat() if row.generated_at else None + if generated_at is None and ts: + generated_at = ts + sectors.append({ + "sector": row.sector, + "region": row.region, + "period": row.period, + "D": float(row.diffusion) if row.diffusion is not None else 0.0, + "SD": float(row.sentiment) if row.sentiment is not None else 0.0, + "AS": float(row.anchor) if row.anchor is not None else 0.0, + "momentum": float(row.momentum) if row.momentum is not None else 0.0, + "mirror_gap": ( + float(row.mirror_gap) if row.mirror_gap is not None else None + ), + "confidence": row.confidence or "low", + "n_mentions": int(row.n_mentions) if row.n_mentions is not None else 0, + "inputs": row.inputs or {}, + }) + + return {"sectors": sectors, "generated_at": generated_at} + except Exception as e: + logger.warning(f"[ConditionsReport] DB fallback failed: {e}") + return {} + + def _build_prompt(self, sectors: list[dict], generated_at: str | None) -> str: + """Build a neutral LLM prompt from sector index data.""" + lines = [ + "You are an economic analyst writing a neutral, data-focused briefing on current " + "conditions in the Chinese economy.", + "", + f"Index generated at: {generated_at or 'unknown'}", + f"Sectors covered: {len(sectors)}", + "", + "For each sector below, comment on:", + "- Current diffusion (D): negative = weaker, positive = stronger.", + "- Momentum vs the previous month.", + "- Confidence level (low / med / high) and why.", + "", + "Then identify the biggest movers (largest absolute change in D or momentum).", + "", + "Finally, include a short 'Cross-source triangulation' section. " + "Compare the official/trade anchor and the mirror-gap where available. " + "Frame any divergence as a data-quality / nowcasting commentary rather than " + "accusation. Note when independent or commercial indicators align or depart " + "from the headline direction.", + "", + "Use plain Markdown. Keep the tone neutral, concise, and focused on the numbers.", + "", + "--- Sector data ---", + ] + + for s in sectors: + inputs = s.get("inputs", {}) or {} + anchor_source = inputs.get("anchor_source") or "none" + reported = inputs.get("reported_value") + mirror = inputs.get("mirror_value") + lines.append( + f"\nSector: {s.get('sector')} | Region: {s.get('region', 'unknown')} | " + f"Period: {s.get('period', 'unknown')}" + ) + lines.append( + f"- Diffusion D={s.get('D', 0):.2f}, " + f"sentiment SD={s.get('SD', 0):.2f}, " + f"anchor AS={s.get('AS', 0):.2f}, " + f"momentum={s.get('momentum', 0):.2f}" + ) + lines.append( + f"- Confidence={s.get('confidence', 'low')}, " + f"mentions={s.get('n_mentions', 0)}" + ) + if s.get("mirror_gap") is not None: + lines.append(f"- Mirror gap={s.get('mirror_gap'):.2f}%") + lines.append(f"- Anchor source={anchor_source}") + if reported is not None: + lines.append(f"- Reported trade value={reported}") + if mirror is not None: + lines.append(f"- Mirror trade value={mirror}") + + return "\n".join(lines) + + def _generate_report( + self, prompt: str, sectors: list[dict], generated_at: str | None + ) -> str: + """Try LLM providers in order; fall back to a rule-based stub.""" + # 1. free_llm_router (async). + try: + from free_llm_router import FreeLLMRouter + + router = FreeLLMRouter() + result = asyncio.run( + router.chat_completion( + messages=[{"role": "user", "content": prompt}], + task_type="briefing", + temperature=0.3, + max_tokens=2048, + ) + ) + text = result.get("text", "").strip() + if text: + return text + except Exception as e: + logger.warning(f"[ConditionsReport] FreeLLMRouter failed: {e}") + + # 2. Anthropic Claude. + api_key = os.getenv("ANTHROPIC_API_KEY") + if api_key: + try: + import anthropic + + client = anthropic.Anthropic(api_key=api_key) + message = client.messages.create( + model=self.llm_model, + max_tokens=2048, + messages=[{"role": "user", "content": prompt}], + ) + text = message.content[0].text.strip() + if text: + return text + except Exception as e: + logger.warning(f"[ConditionsReport] Claude API failed: {e}") + + # 3. Ollama local fallback. + try: + resp = httpx.post( + f"{_OLLAMA_URL}/api/generate", + json={"model": self.ollama_model, "prompt": prompt, "stream": False}, + timeout=120, + ) + if resp.status_code == 200: + text = resp.json().get("response", "").strip() + if text: + return text + except Exception as e: + logger.warning(f"[ConditionsReport] Ollama failed: {e}") + + # 4. Rule-based stub. + return self._rule_based_report(sectors, generated_at) + + def _rule_based_report( + self, sectors: list[dict], generated_at: str | None + ) -> str: + """Minimal deterministic report when no LLM is available.""" + now = datetime.now(timezone.utc) + lines = [ + f"# China Economic Conditions Briefing — {now.date().isoformat()}", + "", + f"_Index generated at: {generated_at or 'unknown'}_", + "", + "## Sector conditions", + ] + + def _arrow(momentum: float) -> str: + if momentum > 0.5: + return "▲ improving" + if momentum < -0.5: + return "▼ weakening" + return "▬ stable" + + for s in sorted(sectors, key=lambda x: abs(x.get("momentum", 0)), reverse=True): + sector = s.get("sector", "unknown") + d = s.get("D", 0.0) + momentum = s.get("momentum", 0.0) + conf = s.get("confidence", "low") + gap = s.get("mirror_gap") + lines.append( + f"- **{sector}**: D={d:.2f}, momentum={momentum:.2f} {_arrow(momentum)}, " + f"confidence={conf}" + ) + if gap is not None: + lines.append(f" - Mirror gap: {gap:.2f}%") + + if sectors: + movers = sorted(sectors, key=lambda x: abs(x.get("momentum", 0)), reverse=True)[:3] + lines.extend(["", "## Biggest movers"]) + for s in movers: + lines.append( + f"- **{s.get('sector')}**: momentum {s.get('momentum', 0):.2f} " + f"(D {s.get('D', 0):.2f})" + ) + + lines.extend( + [ + "", + "## Cross-source triangulation", + "- Compare official/trade anchors with mirror-gap and high-frequency indicators.", + "- Large mirror gaps may reflect reporting lags, valuation effects, or " + "transshipment; treat them as nowcasting uncertainty, not proof of revision.", + "- Where confidence is low, rely on the direction of high-frequency commercial " + "series and sentiment diffusion rather than point estimates.", + "", + "_Report generated by rule-based fallback (no LLM available)._", + ] + ) + return "\n".join(lines) + + def _write_report(self, report: str, today) -> tuple[Path, Path]: + """Write dated report and update latest symlink/file.""" + _REPORT_DIR.mkdir(parents=True, exist_ok=True) + date_str = today.isoformat() + report_path = _REPORT_DIR / f"{date_str}.md" + latest_path = _REPORT_DIR / "latest.md" + report_path.write_text(report, encoding="utf-8") + latest_path.write_text(report, encoding="utf-8") + return report_path, latest_path + + @staticmethod + def _escape_markdown(text: str) -> str: + """Escape special characters for Telegram MarkdownV2.""" + for ch in ( + "_", "*", "[", "]", "(", ")", "~", "`", ">", "#", "+", + "-", "=", "|", "{", "}", ".", "!", + ): + text = text.replace(ch, f"\\{ch}") + return text + + def _send_telegram(self, report: str): + """Send report via Telegram bot (best-effort).""" + bot_token = os.getenv("TELEGRAM_BOT_TOKEN") + chat_id = os.getenv("TELEGRAM_ALERT_CHAT_ID") + if not bot_token or not chat_id: + return + + try: + escaped = self._escape_markdown(report[:3500]) + httpx.post( + f"https://api.telegram.org/bot{bot_token}/sendMessage", + json={ + "chat_id": chat_id, + "text": f"📊 *China Conditions Report*\n\n{escaped}", + "parse_mode": "MarkdownV2", + }, + timeout=10, + ) + except Exception as e: + logger.warning(f"[ConditionsReport] Telegram send failed: {e}") + + +if __name__ == "__main__": + # Stand-alone sanity run: generate a report from sample index data. + sample = { + "generated_at": datetime.now(timezone.utc).isoformat(), + "sectors": [ + { + "sector": "electronics_machinery", + "region": "coastal_export", + "period": "2024-05", + "D": 18.5, + "SD": 12.0, + "AS": 22.0, + "momentum": 4.2, + "mirror_gap": -8.3, + "confidence": "high", + "n_mentions": 42, + "inputs": { + "reported_value": 120_000_000_000.0, + "mirror_value": 110_000_000_000.0, + "anchor_growth": 0.12, + "anchor_source": "trade", + }, + }, + { + "sector": "property_construction", + "region": "national", + "period": "2024-05", + "D": -22.1, + "SD": -18.5, + "AS": -25.0, + "momentum": -6.7, + "mirror_gap": None, + "confidence": "med", + "n_mentions": 18, + "inputs": {"anchor_source": "cn_hf:bdi"}, + }, + ], + } + + gen = ConditionsReportGenerator() + prompt = gen._build_prompt(sample["sectors"], sample["generated_at"]) + report = gen._generate_report(prompt, sample["sectors"], sample["generated_at"]) + paths = gen._write_report(report, datetime.now(timezone.utc).date()) + print(report) + print(f"\n--- wrote: {paths[0]} and {paths[1]} ---") diff --git a/processors/ddti_index.py b/processors/ddti_index.py new file mode 100644 index 0000000..75d2f02 --- /dev/null +++ b/processors/ddti_index.py @@ -0,0 +1,355 @@ +"""DDTI selectivity/novelty index — the reachable two-thirds of the DDTI. + +The feasibility probe (scripts/ddti_feasibility.py) found that deletion *velocity* +(minute-resolution survival curves) needs in-China egress, but *selectivity* and +*novelty* are reconstructable today from China Digital Times' curated deletion +stream. This module turns that stream into a ranked threat index. + +HONEST SCOPE: CDT gives a numerator (censored items), not a denominator (all +items on a topic), so this is NOT a true deletion-RATE selectivity. It measures +**censor attention allocation** — how much of the apparatus's output targets each +term, recency-weighted — plus **novelty** (newly-sensitive / bursting terms). A +true rate would require joining against a topic-volume denominator (e.g. the +weibo_hotsearch trending stream); see compute_selectivity_novelty's docstring. + +Layered like sentiment/zh_finance: a pure, testable scoring core + +extract_terms() + a thin BaseProcessor that reads accumulated ddti_deletion +Articles and writes the index to Redis. +""" + +import json +import logging +import os +import re +from datetime import datetime, timedelta, timezone +from functools import lru_cache +from pathlib import Path + +from core.base_processor import BaseProcessor + +logger = logging.getLogger(__name__) + +_GAZETTEER_PATH = Path(__file__).resolve().parent.parent / "config" / "zh_censorship_gazetteer.json" +_DOMAINS_PATH = Path(__file__).resolve().parent.parent / "config" / "ddti_threat_categories.json" + + +@lru_cache(maxsize=1) +def load_domain_map() -> tuple: + """Return (term->domain dict, domains-meta dict). Empty on miss.""" + try: + data = json.loads(_DOMAINS_PATH.read_text(encoding="utf-8")) + return data.get("term_domain", {}), data.get("domains", {}) + except Exception as e: + logger.warning(f"[DDTI-Index] domain map load failed: {e}") + return {}, {} + + +@lru_cache(maxsize=1) +def load_censorship_terms() -> tuple: + """Flatten the Chinese censorship gazetteer to a tuple of zh terms. Empty on miss.""" + try: + data = json.loads(_GAZETTEER_PATH.read_text(encoding="utf-8")) + terms = [] + for cat in data.get("categories", {}).values(): + terms += [e["zh"] for e in cat if e.get("zh")] + return tuple(dict.fromkeys(terms)) # dedup, preserve order + except Exception as e: + logger.warning(f"[DDTI-Index] censorship gazetteer load failed: {e}") + return tuple() + +# ── Tunable index parameters ────────────────────────────────────── +CURRENT_WINDOW_DAYS = 3 # what counts as "now" +HISTORY_WINDOW_DAYS = 30 # baseline period for burst/novelty +HALF_LIFE_DAYS = 2.0 # attention decay: a 2-day-old deletion counts half +NOVELTY_WEIGHT = 1.5 # how hard novelty amplifies attention (the key knob) +TOP_N = 25 +ALERT_THREAT_THRESHOLD = 3.0 # push terms above this to the alert stream + +# Quoted/bracketed spans — bilingual. CDT puts the censored term in quotes +# (Chinese 《》「」 or curly/straight double quotes), so these spans ARE the +# sensitive vocabulary, not the article's CMS category. +_ENTITY_SPAN = re.compile(r"[《「『“\"]([^》」』”\"]{2,60})[》」』”\"]") + +# Canonical China-censorship entities that recur unquoted in English headlines. +# Substring-matched; kept small and auditable rather than a full NER model. +_EN_GAZETTEER = [ + "Tiananmen", "Xi Jinping", "ByteDance", "Douyin", "WeChat", "Weibo", + "Hong Kong", "Xinjiang", "Tibet", "Taiwan", "PBOC", "Sino-American Summit", + "capital controls", "youth unemployment", "coal mine", "bank run", + "property", "censorship", "sensitive words", "404", +] + +# Generic CDT/CMS taxonomy that carries no threat signal — dropped from terms. +_TAXONOMY_STOP = { + "cdt highlights", "level 2 article", "level 3 article", "china & the world", + "politics", "economy", "law", "sci-tech", "society", "recent news", + "the great divide", "translation", "china", "chinese", "featured", "news", +} + + +def combine_threat(attention: float, novelty: float, novelty_weight: float = NOVELTY_WEIGHT) -> float: + """Combine censor-attention and novelty into one threat score. [TUNING POINT] + + Default: threat = attention · (1 + novelty_weight · novelty). + - attention dominates magnitude (a loud, heavily-censored term scores high); + - novelty multiplies it (a *newly* sensitive term of equal volume outranks a + chronically-censored one). + Trade-offs you may want to change: + - Pure additive (attention + w·novelty) treats a brand-new low-volume term as + a top threat — more sensitive to emerging signals, noisier. + - Multiplicative (current) needs *some* volume before novelty matters — calmer, + but can miss a single-post canary on a brand-new term. + """ + return attention * (1.0 + novelty_weight * novelty) + + +def extract_terms(title: str, text: str, tags: list[str], lexicon: dict) -> list[str]: + """Extract candidate threat terms from a censored item (deterministic). + + Three sources, unioned: (1) CDT's own tags, (2) known finance/policy + vocabulary from the shared lexicon present as substrings, (3) bracketed/quoted + entity spans in the headline. Substring matching (no \\b — CJK-safe). + """ + blob = f"{title} {text}" + terms = set() + + # (1) quoted/bracketed spans in the title — the censored term itself + for m in _ENTITY_SPAN.findall(title or ""): + m = m.strip().strip(",.;:") + if 1 < len(m) <= 60: + terms.add(m) + + # (2) canonical censorship entities (case-insensitive substring) + low = blob.lower() + for ent in _EN_GAZETTEER: + if ent.lower() in low: + terms.add(ent) + + # (3a) Chinese censorship euphemisms / deletion triggers (fires on zh feeds) + for zh in load_censorship_terms(): + if zh in blob: + terms.add(zh) + + # (3) Chinese finance/policy vocabulary (for any Chinese in the text) + lex_terms = ( + lexicon.get("finance_keywords", []) + + lexicon.get("hawkish_keywords", []) + + lexicon.get("dovish_keywords", []) + ) + for sector_kws in lexicon.get("sector_keywords", {}).values(): + lex_terms += sector_kws + for kw in lex_terms: + if kw and kw in blob: + terms.add(kw) + + # (4) non-generic CDT tags only (drop CMS taxonomy noise) + for t in tags or []: + t = t.strip() + if t and t.lower() not in _TAXONOMY_STOP: + terms.add(t) + + return sorted(terms) + + +def compute_selectivity_novelty( + observations: list[dict], + now: datetime, + *, + current_window_days: int = CURRENT_WINDOW_DAYS, + history_window_days: int = HISTORY_WINDOW_DAYS, + half_life_days: float = HALF_LIFE_DAYS, + novelty_weight: float = NOVELTY_WEIGHT, + top_n: int = TOP_N, + domain_map: dict = None, +) -> dict: + """Rank censored terms by threat = attention × novelty amplification. + + observations: [{"terms": [str], "detected_at": datetime(aware), "title": str, + "url": str, "source": str}] + + attention(term) = Σ over CURRENT-window deletions of 0.5**(age_days/half_life) + — recency-weighted censor attention. + novelty(term) = 1.0 if the term never appeared in the baseline period and is + appearing now (a newly-sensitive term); else a bounded + function of the burst ratio (recent_rate / baseline_rate). + threat(term) = combine_threat(attention, novelty). + + To upgrade this to a TRUE selectivity rate, divide recent_count by a topic-volume + denominator (e.g. weibo_hotsearch mentions of the same term) before ranking. + """ + current_cutoff = now - timedelta(days=current_window_days) + history_cutoff = now - timedelta(days=history_window_days) + baseline_days = max(1e-9, history_window_days - current_window_days) + half_life_seconds = half_life_days * 86400 + + agg: dict[str, dict] = {} + + def _slot(term): + return agg.setdefault(term, { + "attention": 0.0, "recent_count": 0, "hist_count": 0, + "first_seen": None, "samples": [], + }) + + n_used = 0 + for obs in observations: + ts = obs.get("detected_at") + if ts is None or ts < history_cutoff: + continue + n_used += 1 + in_current = ts >= current_cutoff + age_days = max(0.0, (now - ts).total_seconds()) / 86400 + decay = 0.5 ** (age_days * 86400 / half_life_seconds) if half_life_seconds else 1.0 + + for term in obs.get("terms", []): + s = _slot(term) + if s["first_seen"] is None or ts < s["first_seen"]: + s["first_seen"] = ts + if in_current: + s["attention"] += decay + s["recent_count"] += 1 + if len(s["samples"]) < 3 and obs.get("title"): + s["samples"].append({"title": obs["title"][:140], "url": obs.get("url", "")}) + else: + s["hist_count"] += 1 + + ranked = [] + for term, s in agg.items(): + if s["recent_count"] < 1: + continue # not a *current* threat + baseline_rate = s["hist_count"] / baseline_days + recent_rate = s["recent_count"] / current_window_days + is_new = s["hist_count"] == 0 + if is_new: + novelty = 1.0 + burst_ratio = None + else: + burst_ratio = recent_rate / baseline_rate if baseline_rate > 0 else float("inf") + excess = max(0.0, burst_ratio - 1.0) + novelty = excess / (1.0 + excess) # bounded to [0,1) + threat = combine_threat(s["attention"], novelty, novelty_weight) + ranked.append({ + "term": term, + "domain": (domain_map or {}).get(term, "OTHER"), + "threat": round(threat, 4), + "attention": round(s["attention"], 4), + "novelty": round(novelty, 4), + "burst_ratio": (round(burst_ratio, 2) if burst_ratio not in (None, float("inf")) else burst_ratio), + "is_new": is_new, + "recent_count": s["recent_count"], + "hist_count": s["hist_count"], + "first_seen": s["first_seen"].isoformat() if s["first_seen"] else None, + "samples": s["samples"], + }) + + ranked.sort(key=lambda x: x["threat"], reverse=True) + return { + "generated_at": now.isoformat(), + "window": {"current_days": current_window_days, "history_days": history_window_days, + "half_life_days": half_life_days, "novelty_weight": novelty_weight}, + "scope": "censor_attention_allocation (numerator-only; not a true deletion rate)", + "n_observations_used": n_used, + "n_terms": len(ranked), + "ranked": ranked[:top_n], + } + + +def persist_snapshot(index: dict, db) -> bool: + """Write one DDTIIndexSnapshot row (the time-series record). Best-effort.""" + try: + from storage.models import DDTIIndexSnapshot + top = index["ranked"][0] if index.get("ranked") else {} + row = DDTIIndexSnapshot( + generated_at=datetime.fromisoformat(index["generated_at"]), + n_observations=index.get("n_observations_used", 0), + n_terms=index.get("n_terms", 0), + n_new=sum(1 for r in index.get("ranked", []) if r.get("is_new")), + top_term=top.get("term"), + top_threat=float(top.get("threat", 0.0)), + window=index.get("window", {}), + ranked=index.get("ranked", []), + scope=index.get("scope"), + ) + db.add(row) + db.commit() + return True + except Exception as e: + logger.warning(f"[DDTI-Index] snapshot persist failed: {e}") + try: + db.rollback() + except Exception: + pass + return False + + +class DDTIIndexProcessor(BaseProcessor): + """Aggregate processor: ddti_deletion Articles → ranked threat index → Redis.""" + + name = "ddti_index" + + def __init__(self, config: dict = None): + super().__init__(config) + self.history_days = self.config.get("history_window_days", HISTORY_WINDOW_DAYS) + + def process_one(self, article: dict) -> dict: + return {"status": "use_run"} # aggregate processor — see run() + + def run(self) -> dict: + try: + from api.database import SessionLocal + from storage.models import Article + from processors.zh_finance import load_lexicon + except Exception as e: + return {"status": "error", "error": f"imports unavailable: {e}"} + + lexicon = load_lexicon() + now = datetime.now(timezone.utc) + cutoff = now - timedelta(days=self.history_days) + + db = SessionLocal() + try: + rows = ( + db.query(Article) + .filter(Article.category == "ddti_deletion", Article.collected_at >= cutoff) + .all() + ) + observations = [] + for a in rows: + meta = getattr(a, "extra_data", None) or {} + tags = meta.get("tags", []) if isinstance(meta, dict) else [] + observations.append({ + "terms": extract_terms(a.title or "", a.full_text or "", tags, lexicon), + "detected_at": a.collected_at or now, + "title": a.title or "", + "url": a.url or "", + "source": a.author or "", + }) + + dmap, _ = load_domain_map() + index = compute_selectivity_novelty(observations, now, domain_map=dmap) + self._publish(index) + persist_snapshot(index, db) # durable time-series row + logger.info(f"[DDTI-Index] {index['n_terms']} terms from {index['n_observations_used']} deletions") + return {"status": "success", "terms": index["n_terms"], + "observations": index["n_observations_used"]} + except Exception as e: + logger.error(f"[DDTI-Index] run failed: {e}") + return {"status": "error", "error": str(e)} + finally: + db.close() + + def _publish(self, index: dict): + """Write latest index to Redis + push high-threat terms to an alert stream.""" + try: + import redis + r = redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"), decode_responses=True) + r.set("ddti:index:latest", json.dumps(index, ensure_ascii=False), ex=7200) + for term in index["ranked"]: + if term["threat"] >= ALERT_THREAT_THRESHOLD or term["is_new"]: + r.lpush("alerts:ddti", json.dumps({ + "term": term["term"], "threat": term["threat"], + "is_new": term["is_new"], "at": index["generated_at"], + }, ensure_ascii=False)) + r.ltrim("alerts:ddti", 0, 199) + r.close() + except Exception as e: + logger.warning(f"[DDTI-Index] Redis publish failed: {e}") diff --git a/processors/sentiment.py b/processors/sentiment.py index 036fa57..13fb52a 100644 --- a/processors/sentiment.py +++ b/processors/sentiment.py @@ -135,9 +135,16 @@ def process_one(self, article: dict) -> dict: # Detect language and route to appropriate model language = self._detect_language(text) score, model_used = self._analyze(text, language=language) - text_lower = text.lower() - direction = self._detect_policy_direction(text_lower) - sectors = self._detect_sectors(text_lower) + + # Policy-direction / sector enrichment is language-specific. The English + # path uses \b-anchored regex (sentiment.py), which never matches CJK; for + # Chinese we route to substring-based detection that handles negation. + if language == "zh": + direction, sectors = self._detect_zh_policy_and_sectors(text) + else: + text_lower = text.lower() + direction = self._detect_policy_direction(text_lower) + sectors = self._detect_sectors(text_lower) return { "article_id": article_id, @@ -316,6 +323,23 @@ def _detect_sectors(self, text_lower: str) -> dict: sectors[sector] = {"mentions": mentions} return sectors + def _detect_zh_policy_and_sectors(self, text: str) -> tuple[str, dict]: + """Chinese policy-direction + sector detection (substring, negation-aware). + + Returns (direction, sectors). Degrades to ("neutral", {}) if the lexicon + is missing, so a Chinese article still gets its sentiment score. + """ + try: + from processors.zh_finance import ( + detect_chinese_policy_and_sectors, + load_lexicon, + ) + result = detect_chinese_policy_and_sectors(text, load_lexicon()) + return result["policy_direction"], result["sectors"] + except Exception as e: + logger.debug(f"[Sentiment] Chinese enrichment failed: {e}") + return "neutral", {} + def _store_results(self, results: list[dict], db): from storage.models import SentimentScore diff --git a/processors/zh_finance.py b/processors/zh_finance.py new file mode 100644 index 0000000..c276d78 --- /dev/null +++ b/processors/zh_finance.py @@ -0,0 +1,128 @@ +"""Chinese financial-text helpers: lexicon loading + policy/sector detection. + +Why this module exists: +- sentiment.py's hawkish/dovish/sector detection uses `\b...\b` regex, which does + NOT anchor on CJK characters — so it silently never matches Chinese. This module + provides substring-based detection that works on Chinese, including negation + handling (不加息 = "no rate hike" → not hawkish). +- The lexicon itself lives in config/zh_finance_lexicon.json so it can be tuned + without code changes (same philosophy as sources.yaml feeds). + +The detection function was drafted by Kimi Code (native Chinese-language strength) +and reviewed/integrated here. +""" + +import json +import logging +from functools import lru_cache +from pathlib import Path + +logger = logging.getLogger(__name__) + +_LEXICON_PATH = Path(__file__).resolve().parent.parent / "config" / "zh_finance_lexicon.json" + + +@lru_cache(maxsize=1) +def load_lexicon() -> dict: + """Load and cache the Chinese finance lexicon. Returns {} if missing.""" + try: + with open(_LEXICON_PATH, encoding="utf-8") as f: + data = json.load(f) + logger.info(f"[zh_finance] Loaded lexicon from {_LEXICON_PATH}") + return data + except FileNotFoundError: + logger.warning(f"[zh_finance] Lexicon not found at {_LEXICON_PATH}") + return {} + except json.JSONDecodeError as e: + logger.error(f"[zh_finance] Lexicon is invalid JSON: {e}") + return {} + + +def detect_chinese_policy_and_sectors(text: str, lexicon: dict) -> dict: + """ + Detect Chinese policy direction (hawkish/dovish/neutral) and sector mentions. + + Uses substring matching because Chinese has no word boundaries. Handles + negation: a keyword immediately preceded (within ~2 characters) by a negator + is not counted for its own side and is instead counted for the opposite side. + """ + # Chinese negation markers in financial/policy text. Base set + Kimi-contributed + # additions (尚未/并未/并无/不再) from config/zh_market_modifiers.json. + NEGATORS = ("不", "未", "没", "暂不", "不会", "难以", "尚未", "并未", "并无", "不再") + # Minimum net hits required before a direction is declared. + POLICY_THRESHOLD = 2 + + def _count_direction(text, keywords, opposite_counts): + """ + Count non-negated keyword hits for one policy direction. + + Negated hits are recorded in opposite_counts so they can be added to the + opposite direction's total later. + """ + hits = 0 + for kw in keywords: + # Ignore empty keywords to avoid spurious matching. + if not kw: + continue + start = 0 + while True: + idx = text.find(kw, start) + if idx == -1: + break + # Inspect up to 2 characters immediately before the keyword. + preceding = text[max(0, idx - 2):idx] + negated = any(neg in preceding for neg in NEGATORS) + if negated: + # A negated hawkish signal (e.g. 不加息) is dovish, + # and a negated dovish signal (e.g. 不降息) is hawkish. + opposite_counts.append(1) + else: + hits += 1 + # Advance by 1 to allow overlapping matches. + start = idx + 1 + return hits + + # Containers for negated hits that flip to the opposite direction. + hawkish_negated_as_dovish = [] + dovish_negated_as_hawkish = [] + + hawkish_hits = _count_direction( + text, lexicon.get("hawkish_keywords", []), hawkish_negated_as_dovish + ) + dovish_hits = _count_direction( + text, lexicon.get("dovish_keywords", []), dovish_negated_as_hawkish + ) + + # Combine direct hits with hits flipped from the opposite direction. + hawkish_total = hawkish_hits + len(dovish_negated_as_hawkish) + dovish_total = dovish_hits + len(hawkish_negated_as_dovish) + + # Decide the policy direction, requiring a minimum number of net hits. + if hawkish_total >= POLICY_THRESHOLD and hawkish_total > dovish_total: + policy_direction = "hawkish" + elif dovish_total >= POLICY_THRESHOLD and dovish_total > hawkish_total: + policy_direction = "dovish" + else: + policy_direction = "neutral" + + # Count sector mentions via substring matching. + sectors = {} + for sector, keywords in lexicon.get("sector_keywords", {}).items(): + count = 0 + for kw in keywords: + if not kw: + continue + start = 0 + while True: + idx = text.find(kw, start) + if idx == -1: + break + count += 1 + start = idx + 1 + if count: + sectors[sector] = {"mentions": count} + + return { + "policy_direction": policy_direction, + "sectors": sectors, + } diff --git a/scripts/conditions_pull.py b/scripts/conditions_pull.py new file mode 100644 index 0000000..7fd7441 --- /dev/null +++ b/scripts/conditions_pull.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +"""End-to-end China economic conditions pull (no Celery). + +This script wires the CBB collectors, the pure conditions-index computation, +and durable storage tiers together in one runnable file. + +Storage tiers (all best-effort, independent): + 1. Disk snapshot — data/cbb/snapshots/cbb_.json + 2. Disk history — data/cbb/history.jsonl (compact append-only log) + 3. Postgres — conditions_index_snapshots rows, IF reachable. + 4. Redis — cbb:latest, IF reachable. + +Usage: + python scripts/conditions_pull.py +""" + +import asyncio +import json +import logging +import os +import sys +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any + +import pandas as pd + +ROOT = Path(__file__).resolve().parent.parent +sys.path.insert(0, str(ROOT)) + +from collectors.comtrade_mirror import ComtradeMirrorCollector +from collectors.cn_indicators import CNIndicatorsCollector +from processors.conditions_index import compute_conditions +from storage.models import ConditionsIndexSnapshot +from api.database import SessionLocal + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", +) +logger = logging.getLogger("conditions_pull") + +OUT_DIR = ROOT / "data" / "cbb" +SNAPSHOT_DIR = OUT_DIR / "snapshots" +HISTORY_PATH = OUT_DIR / "history.jsonl" +TAXONOMY_PATH = ROOT / "config" / "cbb_taxonomy.json" + + +# ── Input helpers ──────────────────────────────────────────────────────────── +def _load_taxonomy() -> dict: + try: + with open(TAXONOMY_PATH, "r", encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.warning("Could not load taxonomy from %s: %s", TAXONOMY_PATH, e) + + # Minimal inline fallback so the script can run before the config fragment lands. + return { + "sectors": { + "electronics_machinery": { + "hs_codes": ["84", "85", "90"], + "region": "coastal_export", + "cn_hf_sources": ["ccfi", "scfi"], + }, + "textiles_apparel": { + "hs_codes": ["61", "62", "63"], + "region": "coastal_export", + "cn_hf_sources": ["ccfi"], + }, + "autos": { + "hs_codes": ["87"], + "region": "inland", + "cn_hf_sources": ["cpca_retail_pv", "cpca_wholesale_pv"], + }, + "steel": { + "hs_codes": ["72", "73"], + "region": "northeast", + "cn_hf_sources": ["bdi"], + }, + "cement": { + "hs_codes": ["25", "68"], + "region": "inland", + "cn_hf_sources": [], + }, + "coal": { + "hs_codes": ["27"], + "region": "inland", + "cn_hf_sources": ["bdi"], + }, + "transport_logistics": { + "hs_codes": ["86", "88", "89"], + "region": "national", + "cn_hf_sources": ["bdi", "ccfi", "scfi"], + }, + "property": { + "hs_codes": ["94"], + "region": "national", + "cn_hf_sources": [], + }, + "consumer_macro": { + "hs_codes": ["29", "33", "39"], + "region": "national", + "cn_hf_sources": ["yiwu_index"], + }, + } + } + + +def _query_sentiment_mentions() -> list[dict]: + """Load sentiment sector scores from Postgres (best-effort).""" + try: + from storage.models import SentimentScore + + db = SessionLocal() + try: + cutoff = datetime.now(timezone.utc) - timedelta(days=30) + rows = db.query(SentimentScore).filter(SentimentScore.created_at >= cutoff).all() + mentions: list[dict] = [] + for row in rows: + scores = row.sector_scores or {} + for sector in scores.keys(): + mentions.append( + { + "date": row.created_at or datetime.now(timezone.utc), + "sector": sector, + "score": float(row.overall or 0.0), + } + ) + return mentions + finally: + db.close() + except Exception as e: + logger.warning("Sentiment query failed: %s", e) + return [] + + +def _to_records(parsed: Any) -> list[dict]: + if isinstance(parsed, pd.DataFrame): + return parsed.replace({pd.NA: None}).to_dict("records") + return list(parsed or []) + + +def _records_from_economic_data( + trade_rows: list[dict], indicator_rows: list[dict] +) -> tuple[list[dict], list[dict]]: + """Convert EconomicData-style records into compute_conditions inputs.""" + trade: list[dict] = [] + indicators: list[dict] = [] + + for row in trade_rows: + # Accept either raw collector records or EconomicData dicts. + if "flow" in row and "hs" in row: + trade.append(row) + continue + + ind = str(row.get("indicator", "")) + meta = row.get("metadata") or row.get("extra_data") or {} + if not isinstance(meta, dict): + meta = {} + + if ind.startswith("trade_"): + parts = ind.split("_") + if len(parts) >= 3: + _, flow, hs = parts[:3] + trade.append( + { + "date": row.get("date"), + "flow": flow, + "hs": hs, + "value": float(row.get("value") or 0.0), + "reporter": meta.get("reporter", "156"), + "partner": meta.get("partner", "0"), + "net_weight": meta.get("netWeight", 0.0), + } + ) + + for row in indicator_rows: + if row.get("indicator") is not None and row.get("value") is not None: + indicators.append( + { + "date": row.get("date"), + "indicator": row.get("indicator"), + "value": float(row.get("value") or 0.0), + } + ) + + return trade, indicators + + +async def _collect_trade() -> list[dict]: + """Run the Comtrade mirror collector and return trade records.""" + collector = ComtradeMirrorCollector({"recent_months": 3}) + try: + raw = await collector.collect() + parsed = await collector.parse(raw) + return _to_records(parsed) + except Exception as e: + logger.warning("Trade collection failed: %s", e) + return [] + finally: + await collector.close() + + +async def _collect_cn_indicators() -> list[dict]: + """Run the CN indicators collector and return indicator records.""" + collector = CNIndicatorsCollector({}) + try: + raw = await collector.collect() + parsed = await collector.parse(raw) + return _to_records(parsed) + except Exception as e: + logger.warning("CN indicator collection failed: %s", e) + return [] + finally: + await collector.close() + + +# ── Storage helpers ────────────────────────────────────────────────────────── +def _store_snapshot(index: list[dict], now: datetime) -> Path: + SNAPSHOT_DIR.mkdir(parents=True, exist_ok=True) + stamp = now.strftime("%Y%m%dT%H%M%S") + snap_path = SNAPSHOT_DIR / f"cbb_{stamp}.json" + payload = { + "generated_at": now.isoformat(), + "count": len(index), + "index": index, + } + snap_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + return snap_path + + +def _append_history(index: list[dict], now: datetime) -> None: + OUT_DIR.mkdir(parents=True, exist_ok=True) + summary = { + "generated_at": now.isoformat(), + "count": len(index), + "sectors": [ + { + "sector": r["sector"], + "period": r["period"], + "D": r["D"], + "momentum": r["momentum"], + "confidence": r["confidence"], + } + for r in index + ], + } + with open(HISTORY_PATH, "a", encoding="utf-8") as f: + f.write(json.dumps(summary, ensure_ascii=False) + "\n") + + +def _store_postgres(index: list[dict], now: datetime) -> str: + try: + db = SessionLocal() + try: + for row in index: + snapshot = ConditionsIndexSnapshot( + generated_at=now, + period=row.get("period"), + sector=row["sector"], + region=row.get("region"), + diffusion=row.get("D", 0.0), + sentiment=row.get("SD", 0.0), + anchor=row.get("AS", 0.0), + momentum=row.get("momentum", 0.0), + mirror_gap=row.get("mirror_gap"), + confidence=row.get("confidence", "low"), + n_mentions=row.get("n_mentions", 0), + inputs=row.get("inputs", {}), + ) + db.add(snapshot) + db.commit() + return "ok" + finally: + db.close() + except Exception as e: + return f"unavailable ({type(e).__name__})" + + +def _store_redis(index: list[dict], now: datetime) -> str: + try: + import redis + + r = redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"), decode_responses=True) + payload = { + "generated_at": now.isoformat(), + "status": "live", + "count": len(index), + "index": index, + } + r.set("cbb:latest", json.dumps(payload, ensure_ascii=False), ex=7200) + r.close() + return "ok" + except Exception as e: + return f"unavailable ({type(e).__name__})" + + +def _print_table(index: list[dict]) -> None: + print("\n=== China Economic Conditions Index ===") + header = ( + f"{'Sector':<22} {'Region':<14} {'Period':<8} {'D':>8} " + f"{'SD':>8} {'AS':>8} {'Mom':>8} {'Gap':>8} {'Conf':>6} {'N':>5}" + ) + print(header) + print("-" * len(header)) + for row in index: + gap = f"{row['mirror_gap']:.1f}" if row.get("mirror_gap") is not None else "-" + print( + f"{row['sector']:<22} {row['region']:<14} {row['period']:<8} " + f"{row['D']:>8.2f} {row['SD']:>8.2f} {row['AS']:>8.2f} " + f"{row['momentum']:>8.2f} {gap:>8} {row['confidence']:>6} {row['n_mentions']:>5d}" + ) + print(f"\nTotal sectors: {len(index)}") + + +# ── Main entry point ───────────────────────────────────────────────────────── +async def main(): + now = datetime.now(timezone.utc) + taxonomy = _load_taxonomy() + if not taxonomy.get("sectors"): + print("No taxonomy available; aborting.") + return [] + + print("[conditions_pull] Collecting trade data...") + trade_raw = await _collect_trade() + print(f"[conditions_pull] Trade records: {len(trade_raw)}") + + print("[conditions_pull] Collecting CN high-frequency indicators...") + cn_raw = await _collect_cn_indicators() + print(f"[conditions_pull] CN indicator records: {len(cn_raw)}") + + print("[conditions_pull] Loading sentiment mentions...") + sentiment = _query_sentiment_mentions() + print(f"[conditions_pull] Sentiment mentions: {len(sentiment)}") + + trade_records, indicator_records = _records_from_economic_data(trade_raw, cn_raw) + index = compute_conditions(trade_records, indicator_records, sentiment, taxonomy, now) + + # Disk tiers (always attempted). + snap_path = _store_snapshot(index, now) + _append_history(index, now) + + # Remote tiers (best-effort). + db_status = _store_postgres(index, now) + redis_status = _store_redis(index, now) + + print("\n=== Storage ===") + print(f" snapshot : {snap_path}") + print(f" history : {HISTORY_PATH}") + print(f" postgres : {db_status}") + print(f" redis : {redis_status}") + + _print_table(index) + return index + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scripts/ddti_feasibility.py b/scripts/ddti_feasibility.py new file mode 100644 index 0000000..f831f16 --- /dev/null +++ b/scripts/ddti_feasibility.py @@ -0,0 +1,147 @@ +"""DDTI feasibility experiment — run this to get the GO / NO-GO verdict. + +Answers: can we reconstruct a usable Weibo deletion signal today, from this host? +It measures three things and refuses to guess: + + 1. CONTROL — is there working network at all? (fetch a neutral host) + 2. PASSIVE — which anti-censorship feeds (CDT/FreeWeibo/GreatFire) are + reachable, and do they yield dated deletion items? + 3. ACTIVE — can we fetch individual Weibo posts to check liveness, and do + the responses classify into censorship vs. user-deletion? + +Verdict logic deliberately separates "this sandbox has no network" from +"China/Weibo blocked us" from "the signal genuinely isn't there anymore" — only +the last is a real NO-GO for the DDTI. + +Usage: python -m scripts.ddti_feasibility +Run it on the production VPS (and behind the egress you intend to use), NOT in a +restricted sandbox, or the CONTROL gate will (correctly) tell you to. +""" + +import asyncio +import json +import sys + +import httpx + +from collectors.ddti_probe import ( + DDTIProbeCollector, + check_liveness, + classify_post_status, + survival_curve, +) + +CONTROL_URL = "https://example.com" + +# Candidate passive deletion feeds (verified empirically by this script). +CANDIDATE_FEEDS = [ + {"name": "cdt_english", "url": "https://chinadigitaltimes.net/feed/"}, + {"name": "cdt_minitrue", "url": "https://chinadigitaltimes.net/china/minitrue/feed/"}, + {"name": "cdt_chinese", "url": "https://chinadigitaltimes.net/chinese/feed/"}, + {"name": "freeweibo", "url": "https://freeweibo.com/"}, + {"name": "greatfire", "url": "https://en.greatfire.org/"}, +] + +# A handful of Weibo post URLs to test the ACTIVE liveness path. Replace with +# real recently-collected post IDs; placeholders just exercise reachability. +CANDIDATE_POSTS = [ + "https://weibo.com/1234567890/AbCdEfGhI", +] + +VERDICT_VOLUME_THRESHOLD = 20 # min dated deletion items for a "GO" on passive + + +async def _control_ok(client) -> bool: + try: + r = await client.get(CONTROL_URL) + return r.status_code == 200 + except Exception: + return False + + +async def _probe_passive(client) -> dict: + """Reachability + yield per candidate feed.""" + collector = DDTIProbeCollector({"deletion_feeds": []}) + results = {} + for feed in CANDIDATE_FEEDS: + entry = {"reachable": False, "status": None, "items": 0, "dated_items": 0} + try: + r = await client.get(feed["url"], headers={"User-Agent": "Mozilla/5.0"}) + entry["status"] = r.status_code + entry["reachable"] = r.status_code == 200 + if r.status_code == 200: + items = collector._parse_feed_items(feed["name"], r.text) + entry["items"] = len(items) + entry["dated_items"] = sum(1 for i in items if i.get("published_at")) + except Exception as e: + entry["status"] = f"error:{type(e).__name__}" + results[feed["name"]] = entry + return results + + +async def _probe_active(client) -> dict: + """Can we fetch posts, and do responses classify informatively?""" + statuses = [] + for url in CANDIDATE_POSTS: + statuses.append(await check_liveness(client, url)) + informative = sum(1 for s in statuses if s.get("censorship_likelihood") is not None) + return { + "checked": len(statuses), + "informative": informative, + "reachable": informative > 0, + "sample": statuses, + } + + +def _verdict(control: bool, passive: dict, active: dict) -> dict: + if not control: + return {"verdict": "INCONCLUSIVE", + "reason": "No working network on this host (control fetch failed). " + "Rerun on the production VPS — this is NOT a statement about China."} + + passive_go = any( + v["reachable"] and v["dated_items"] >= VERDICT_VOLUME_THRESHOLD + for v in passive.values() + ) + passive_partial = any(v["reachable"] and v["items"] > 0 for v in passive.values()) + active_go = active["reachable"] + + if passive_go or active_go: + path = [] + if passive_go: + path.append("passive feeds yield dated deletion items at volume") + if active_go: + path.append("active liveness checks return classifiable responses") + return {"verdict": "GO", "reason": "; ".join(path), + "build_next": "deletion-velocity tracker → survival curves → DDTI"} + if passive_partial: + return {"verdict": "PARTIAL", + "reason": "feeds reachable but low yield / no timing resolution. " + "Usable as a coarse selectivity signal, not velocity. " + "Consider weighting toward anchor + coherence mechanisms."} + return {"verdict": "NO-GO", + "reason": "control network works but no deletion source is reachable/usable from here. " + "Either route through different egress, or pivot to the anchor-calibration " + "and cross-domain-coherence mechanisms, which need no censorship data."} + + +async def main(): + async with httpx.AsyncClient(timeout=20, follow_redirects=True) as client: + control = await _control_ok(client) + passive = await _probe_passive(client) if control else {} + active = await _probe_active(client) if control else {"reachable": False, "checked": 0, "informative": 0, "sample": []} + + report = { + "control_network_ok": control, + "xml_hardened": __import__("collectors.ddti_probe", fromlist=["_XML_HARDENED"])._XML_HARDENED, + "passive_feeds": passive, + "active_liveness": active, + "verdict": _verdict(control, passive, active), + } + print(json.dumps(report, indent=2, ensure_ascii=False)) + # Non-zero exit on a hard NO-GO so this can gate CI / a build pipeline. + sys.exit(0 if report["verdict"]["verdict"] in ("GO", "PARTIAL", "INCONCLUSIVE") else 2) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scripts/ddti_live_pull.py b/scripts/ddti_live_pull.py new file mode 100644 index 0000000..41e1a4f --- /dev/null +++ b/scripts/ddti_live_pull.py @@ -0,0 +1,204 @@ +"""DDTI live pull — fetch REAL China Digital Times deletion/coverage data now, +compute the selectivity/novelty index, and STORE it durably. + +Storage tiers (all best-effort, independent): + 1. Disk time-series — data/ddti/index_.json (one file per pull) + + data/ddti/history.jsonl (compact append-only log). ALWAYS written. + 2. Dashboard embed — injects the real snapshot into dashboards/ddti_dashboard.html + so opening the file shows real data offline. + 3. Postgres — ddti_index_snapshots row, IF the DB is reachable. + 4. Redis — ddti:index:latest, IF reachable. + +Honest scope: a single pull has no 30-day history, so novelty defaults high +(everything looks "new" the first time). Run it repeatedly (cron) and the +history.jsonl / Postgres rows accumulate the real time-series. Ranking by +attention (tag frequency × recency) is meaningful from the first pull. + +Usage: python -m scripts.ddti_live_pull +""" + +import asyncio +import json +import re +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime +from pathlib import Path + +import httpx + +from collectors.ddti_probe import DDTIProbeCollector +from processors.ddti_index import compute_selectivity_novelty, extract_terms, load_domain_map +from processors.zh_finance import load_lexicon + +ROOT = Path(__file__).resolve().parent.parent +OUT_DIR = ROOT / "data" / "ddti" +DASHBOARD = ROOT / "dashboards" / "ddti_dashboard.html" + +BROWSER_UA = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/124.0 Safari/537.36") + +# Real CDT feeds (English-first, per the "make it English" request). The script +# follows redirects and uses a browser UA — some returned 301/403 to the probe's +# bare client. It keeps whatever actually answers with items. +FEEDS = [ + {"name": "cdt_english", "url": "https://chinadigitaltimes.net/feed/"}, + {"name": "cdt_404", "url": "https://chinadigitaltimes.net/china/404-archive/feed/"}, + {"name": "cdt_minitrue", "url": "https://chinadigitaltimes.net/china/minitrue/feed/"}, + {"name": "cdt_economy", "url": "https://chinadigitaltimes.net/china/economy/feed/"}, +] + +# CDT structural/editorial tags that aren't threat topics — drop as noise. +STOP_TAGS = { + "translation", "cdt highlights", "level 2 article", "level 3 article", + "china", "chinese", "featured", "news", "society", "video", "image", +} + + +def _parse_date(s: str) -> datetime: + if not s: + return datetime.now(timezone.utc) + try: + d = parsedate_to_datetime(s) + return d if d.tzinfo else d.replace(tzinfo=timezone.utc) + except Exception: + return datetime.now(timezone.utc) + + +def _clean_terms(terms): + return [t for t in terms if t.strip().lower() not in STOP_TAGS and len(t.strip()) > 1] + + +async def pull(): + lexicon = load_lexicon() + collector = DDTIProbeCollector({"deletion_feeds": []}) # reuse its RSS parser + observations, reachability = [], {} + + async with httpx.AsyncClient(timeout=25, follow_redirects=True, + headers={"User-Agent": BROWSER_UA, "Referer": "https://chinadigitaltimes.net/"}) as client: + for feed in FEEDS: + try: + r = await client.get(feed["url"]) + reachability[feed["name"]] = r.status_code + if r.status_code != 200: + continue + items = collector._parse_feed_items(feed["name"], r.text) + for it in items: + terms = _clean_terms(extract_terms(it["title"], it["text"], it.get("tags", []), lexicon)) + if not terms: + continue + observations.append({ + "terms": terms, + "detected_at": _parse_date(it.get("published_at", "")), + "title": it["title"], + "url": it["url"], + "source": feed["name"], + }) + except Exception as e: + reachability[feed["name"]] = f"error:{type(e).__name__}" + + return observations, reachability + + +def store_disk(index: dict) -> dict: + OUT_DIR.mkdir(parents=True, exist_ok=True) + stamp = index["generated_at"].replace(":", "").replace("-", "").replace(".", "_") + snap_path = OUT_DIR / f"index_{stamp}.json" + snap_path.write_text(json.dumps(index, ensure_ascii=False, indent=2), encoding="utf-8") + + # compact append-only time-series log + top = index["ranked"][0] if index["ranked"] else {} + line = json.dumps({ + "generated_at": index["generated_at"], + "n_terms": index["n_terms"], + "n_observations": index["n_observations_used"], + "n_new": sum(1 for r in index["ranked"] if r.get("is_new")), + "top_term": top.get("term"), "top_threat": top.get("threat"), + }, ensure_ascii=False) + with open(OUT_DIR / "history.jsonl", "a", encoding="utf-8") as f: + f.write(line + "\n") + return {"snapshot": str(snap_path), "history": str(OUT_DIR / "history.jsonl")} + + +def embed_in_dashboard(index: dict) -> bool: + """Inject the real snapshot so opening the HTML file shows real data offline.""" + try: + html = DASHBOARD.read_text(encoding="utf-8") + payload = json.dumps(index, ensure_ascii=False).replace("") + html = re.sub(r"()?", + block, html, count=1, flags=re.DOTALL) + DASHBOARD.write_text(html, encoding="utf-8") + return True + except Exception as e: + print(f" embed failed: {e}") + return False + + +def store_db(index: dict) -> str: + try: + from api.database import SessionLocal, init_db + from processors.ddti_index import persist_snapshot + init_db() # create_all — makes ddti_index_snapshots if missing + db = SessionLocal() + try: + ok = persist_snapshot(index, db) + return "ok" if ok else "failed" + finally: + db.close() + except Exception as e: + return f"unavailable ({type(e).__name__})" + + +def store_redis(index: dict) -> str: + try: + import os + import redis + r = redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"), decode_responses=True) + r.set("ddti:index:latest", json.dumps(index, ensure_ascii=False), ex=7200) + r.close() + return "ok" + except Exception as e: + return f"unavailable ({type(e).__name__})" + + +async def main(): + print("Pulling live CDT feeds…") + observations, reachability = await pull() + print(f" reachability: {reachability}") + print(f" observations: {len(observations)}") + + now = datetime.now(timezone.utc) + # Cold-start windows: CDT's curated feed spans ~6 weeks, so treat the whole + # batch as "current" and rank by frequency×recency. Once daily cron pulls make + # the data dense, the processor's default 3/30 windows let novelty/burst lead. + dmap, _ = load_domain_map() + index = compute_selectivity_novelty( + observations, now, current_window_days=45, history_window_days=180, top_n=30, + domain_map=dmap, + ) + index["source_feeds"] = reachability + + disk = store_disk(index) + embedded = embed_in_dashboard(index) + db = store_db(index) + rds = store_redis(index) + + print("\n=== STORED ===") + print(f" disk snapshot : {disk['snapshot']}") + print(f" disk history : {disk['history']}") + print(f" dashboard : {'embedded real snapshot' if embedded else 'embed failed'}") + print(f" postgres : {db}") + print(f" redis : {rds}") + + print(f"\n=== INDEX ({index['n_terms']} terms / {index['n_observations_used']} items) ===") + for i, r in enumerate(index["ranked"][:12], 1): + new = " [NEW]" if r["is_new"] else "" + print(f" {i:2}. {r['term'][:38]:38} threat={r['threat']:6.2f} " + f"atten={r['attention']:5.2f} novelty={r['novelty']:.2f} n={r['recent_count']}{new}") + if not index["ranked"]: + print(" (no terms — feeds may have been unreachable from this network)") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/storage/models.py b/storage/models.py index 59ea2e8..15b0143 100644 --- a/storage/models.py +++ b/storage/models.py @@ -177,3 +177,59 @@ class CollectionLog(Base): Index("idx_log_status", "status"), Index("idx_log_run_at", "run_at"), ) + + +class DDTIIndexSnapshot(Base): + """Time-series of DDTI selectivity/novelty index computations. + + One row per index run, so threat scores can be charted over time (the Redis + `ddti:index:latest` key is only the live cache). The full ranked list is kept + in `ranked` (JSONB); the scalar columns are denormalized for fast querying. + """ + __tablename__ = "ddti_index_snapshots" + + id = Column(Integer, primary_key=True, autoincrement=True) + generated_at = Column(DateTime(timezone=True), nullable=False, + default=lambda: datetime.now(timezone.utc)) + n_observations = Column(Integer, default=0) + n_terms = Column(Integer, default=0) + n_new = Column(Integer, default=0) # newly-sensitive terms this window + top_term = Column(Text, nullable=True) + top_threat = Column(Float, default=0.0) + window = Column(JSONB, default=dict) # current/history days, weights + ranked = Column(JSONB, default=list) # full ranked term list + scope = Column(Text, nullable=True) + + __table_args__ = ( + Index("idx_ddti_generated_at", "generated_at"), + Index("idx_ddti_top_term", "top_term"), + ) + + +class ConditionsIndexSnapshot(Base): + """Time-series of China economic conditions index computations. + + One row per sector per index run. The Redis `cbb:latest` key is the live cache; + this table provides durable history for trend analysis. + """ + __tablename__ = "conditions_index_snapshots" + + id = Column(Integer, primary_key=True, autoincrement=True) + generated_at = Column(DateTime(timezone=True), nullable=False, + default=lambda: datetime.now(timezone.utc)) + period = Column(String(8), nullable=True) + sector = Column(String(64), nullable=False) + region = Column(String(32), nullable=True) + diffusion = Column(Float, default=0.0) + sentiment = Column(Float, default=0.0) + anchor = Column(Float, default=0.0) + momentum = Column(Float, default=0.0) + mirror_gap = Column(Float, nullable=True) + confidence = Column(String(8), default="low") + n_mentions = Column(Integer, default=0) + inputs = Column(JSONB, default=dict) + + __table_args__ = ( + Index("idx_cbb_generated_at", "generated_at"), + Index("idx_cbb_sector", "sector"), + ) From d5632abebda1844752aaa6d3e6b1fafc3b5b8eca Mon Sep 17 00:00:00 2001 From: mrinal Date: Thu, 18 Jun 2026 19:50:50 +0530 Subject: [PATCH 3/9] =?UTF-8?q?fix(cbb):=20wire=203=20live=20CN=20sources?= =?UTF-8?q?=20+=20correct=20trade=E2=86=92sector=20mapping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add bespoke parsers for the open_json sources whose JSON the generic date/value mapper couldn't reach: CCFI/SCFI (SSE composite, current+prior week) and macro_customs (chinadata.live, 43 months of GACC trade). 45 real records. - Fix swarm taxonomy: macro_customs (total goods trade) now maps to manufacturing + transport_logistics, not agriculture/finance_banking. - Remove a duplicate logger definition. Result: conditions index shows real live signal (manufacturing/transport D=26.89, anchor +44.8, momentum -23.1 from live customs data). Co-Authored-By: Claude Opus 4.8 (1M context) --- collectors/cn_indicators.py | 56 +++++++++++++++++++++++++++++++++++++ config/cbb_taxonomy.json | 13 ++++----- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/collectors/cn_indicators.py b/collectors/cn_indicators.py index 1ff15b4..7ffbe99 100644 --- a/collectors/cn_indicators.py +++ b/collectors/cn_indicators.py @@ -24,6 +24,59 @@ import pandas as pd from core.base_collector import BaseCollector + + +# ── Per-source bespoke parsers ─────────────────────────────────────── +# Some open sources return idiosyncratic JSON the generic json_path/date/value +# mapping can't reach. Each parser takes the decoded response and returns a flat +# list of {"date": str, "value": number, **extra} observations. + +def _parse_sse_freight(data: Any) -> list: + """Shanghai Shipping Exchange CCFI/SCFI composite index. + + Emits the current AND prior-week points so the index has a period-over-period + delta to compute momentum from. + """ + d = (data or {}).get("data", {}) or {} + cur, last = d.get("currentDate"), d.get("lastDate") + lines = d.get("lineDataList", []) or [] + + def emit(item, label): + out = [{"date": cur, "value": item.get("currentContent"), "line": label}] + if last and item.get("lastContent") is not None: + out.append({"date": last, "value": item.get("lastContent"), "line": label}) + return out + + for item in lines: + dit = (item.get("dataItemTypeName") or "") + en = ((item.get("properties") or {}).get("lineName_EN") or "").strip().upper() + if dit.endswith("_T") or en == "COMPOSITE INDEX": + return emit(item, "COMPOSITE") + if lines: # fallback: first line + return emit(lines[0], lines[0].get("dataItemTypeName") or "LINE") + return [] + + +def _parse_chinadata_series(data: Any, value_key: str = "export") -> list: + """chinadata.live series: data.data is a list of monthly trade rows.""" + rows = ((data or {}).get("data", {}) or {}).get("data", []) or [] + out = [] + for r in rows: + if not isinstance(r, dict): + continue + out.append({ + "date": r.get("date"), + "value": r.get(value_key), + **{k: r.get(k) for k in ("total", "export", "import", "balance") if k in r}, + }) + return out + + +_CUSTOM_PARSERS = { + "ccfi": _parse_sse_freight, + "scfi": _parse_sse_freight, + "macro_customs": _parse_chinadata_series, +} from core.exceptions import SchemaChangedError logger = logging.getLogger(__name__) @@ -332,6 +385,9 @@ async def _fetch_source(self, src: dict) -> Any: return df.to_dict("records") data = resp.json() + custom = _CUSTOM_PARSERS.get(src.get("key")) + if custom: + return custom(data) return self._get_nested(data, src.get("json_path")) # ── Parsing helpers ───────────────────────────────────────────── diff --git a/config/cbb_taxonomy.json b/config/cbb_taxonomy.json index 0020842..c7556ae 100644 --- a/config/cbb_taxonomy.json +++ b/config/cbb_taxonomy.json @@ -65,9 +65,7 @@ "exports" ] }, - "cn_hf_sources": [ - "macro_customs" - ], + "cn_hf_sources": [], "provinces": [ { "name_zh": "黑龙江", @@ -205,8 +203,7 @@ "macro_pboc_credit", "macro_nbs", "macro_caixin_pmi", - "macro_cfl_pmi", - "macro_customs" + "macro_cfl_pmi" ], "provinces": [ { @@ -311,7 +308,8 @@ "yiwu_index", "cement_digital", "coal_cctd", - "autos_cpca_retail" + "autos_cpca_retail", + "macro_customs" ], "provinces": [ { @@ -1027,7 +1025,8 @@ "scfi", "freight_road_logistics", "mobility_12306", - "mobility_civil_aviation" + "mobility_civil_aviation", + "macro_customs" ], "provinces": [ { From 262887246c714fd934719d6d9603f71b45e269fc Mon Sep 17 00:00:00 2001 From: mrinal Date: Thu, 18 Jun 2026 19:54:23 +0530 Subject: [PATCH 4/9] feat(cbb): embed real snapshot in conditions dashboard + fix live-fetch key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add window.__CBB_EMBED__ support (mirrors the DDTI dashboard) so opening the file shows the real conditions snapshot offline, badged ● SNAPSHOT. - Fix: live fetch now reads data.index (the API/snapshot shape), not only data.sectors, so LIVE mode actually renders. - Embedded latest pull: manufacturing & transport_logistics show real customs-driven signal (D=26.89); other sectors await live sources. Co-Authored-By: Claude Opus 4.8 (1M context) --- dashboards/conditions_dashboard.html | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/dashboards/conditions_dashboard.html b/dashboards/conditions_dashboard.html index 2fa5eef..4a77f59 100644 --- a/dashboards/conditions_dashboard.html +++ b/dashboards/conditions_dashboard.html @@ -194,6 +194,7 @@ + + + + From fb644f5af1483e27105f815dd9a181a1e22bf5a1 Mon Sep 17 00:00:00 2001 From: mrinal Date: Thu, 18 Jun 2026 20:01:41 +0530 Subject: [PATCH 6/9] chore(deploy): Vercel static package for the unified PALIMPSEST app MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vercel-app/ = the unified dashboard (index.html, real data embedded) + vercel.json. Static deploy: shows the embedded ● SNAPSHOT; refresh = re-run pull + redeploy. Co-Authored-By: Claude Opus 4.8 (1M context) --- vercel-app/index.html | 306 +++++++++++++++++++++++++++++++++++++++++ vercel-app/vercel.json | 13 ++ 2 files changed, 319 insertions(+) create mode 100644 vercel-app/index.html create mode 100644 vercel-app/vercel.json diff --git a/vercel-app/index.html b/vercel-app/index.html new file mode 100644 index 0000000..0a97a27 --- /dev/null +++ b/vercel-app/index.html @@ -0,0 +1,306 @@ + + + + + +PALIMPSEST · China State Monitor + + + + + + +
+
+
+
PALIMPSEST·CN
+
China State Monitor — Censorship × Economy, One Latent State
+
+
+
DDTI selectivity
+
velocity · egress
+
UTC
+
+
+ + +
+ censorship + economy +
+ + +
+
Synthesizing the China state…
+
+
+
Censor Attention — top threats
+
+
+
+
Economic Conditions — by sector
+
+
+
+
+ + +
+
DDTIDeletion-Differential Threat Index
+
+
+ + +
+
CBBSector × Region Conditions (diffusion −100…+100)
+
+ + +
SectorRegionDiffusionSentAnchorMomMirrorConfN
+
+ +
+ PALIMPSEST unified monitor · censorship (China Digital Times) + economy (UN Comtrade / China HF indicators). + Each panel: live API → embedded snapshot → sample. ↻ refresh +
+
+ + + + + diff --git a/vercel-app/vercel.json b/vercel-app/vercel.json new file mode 100644 index 0000000..ecdc94b --- /dev/null +++ b/vercel-app/vercel.json @@ -0,0 +1,13 @@ +{ + "$schema": "https://openapi.vercel.sh/vercel.json", + "cleanUrls": true, + "headers": [ + { + "source": "/(.*)", + "headers": [ + { "key": "X-Content-Type-Options", "value": "nosniff" }, + { "key": "Referrer-Policy", "value": "no-referrer" } + ] + } + ] +} From c5320690e4296859be48c042e1c1be2e87f77ade Mon Sep 17 00:00:00 2001 From: mrinal Date: Thu, 18 Jun 2026 20:04:59 +0530 Subject: [PATCH 7/9] docs: save Kimi batch prompts (A: scrapers, B: tests, C: quality/security) for later Neutral-framed, project-scoped, refusal-safe. To run when Kimi tokens reset. Co-Authored-By: Claude Opus 4.8 (1M context) --- KIMI_PROMPTS.md | 121 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 KIMI_PROMPTS.md diff --git a/KIMI_PROMPTS.md b/KIMI_PROMPTS.md new file mode 100644 index 0000000..9d0de32 --- /dev/null +++ b/KIMI_PROMPTS.md @@ -0,0 +1,121 @@ +# Kimi Prompt Library — PALIMPSEST / CBB next batches + +Paste these into the Kimi Code window **when tokens reset**. Each block is self-contained. + +## Rules that make Kimi succeed (learned the hard way) +1. **Launch Kimi INSIDE the project dir** — `cd /Users/mrinal/social_scraper` first. (Once it ran from `~` and spent ~12 min globbing the whole home folder.) +2. **Keep prompts neutral/economic.** Kimi (China-hosted) REFUSES politically sensitive China content (it 400'd on a censorship lexicon). So these batches cover only the **economic / engineering** layers. The censorship/DDTI modules are handled by Claude — do NOT ask Kimi to touch `ddti_*`, `zh_censorship_gazetteer`, or anything censorship-framed. +3. Let it **auto-size the swarm** to the number of independent units; don't demand a fixed "100." +4. The full CBB build prompt is already in **`KIMI_CBB_PROMPT.md`** (done — engine built). These are the follow-on batches. + +--- + +## BATCH A — Real scrapers for the stubbed China data sources (highest value) + +```text +You are a senior Python data engineer and Chinese-macro data specialist. Working dir: +/Users/mrinal/social_scraper. Read these first to match conventions exactly: +- collectors/cn_indicators.py (note the _CUSTOM_PARSERS registry near the top, and the + two working examples _parse_sse_freight and _parse_chinadata_series) +- config/cn_hf_sources.json (the source catalog; many entries have access_method "todo") +- core/base_collector.py (collector base; source_type="api" -> EconomicData rows) + +GOAL: turn the "todo" Chinese economic indicators into REAL working data feeds, using your +knowledge of the actual public endpoints (a generic model doesn't know these; you do). + +This is embarrassingly parallel — treat each source as one independent unit (one swarm agent +per source). For EACH source in config/cn_hf_sources.json whose access_method == "todo": +1. Find the real public data endpoint (open JSON/CSV API, or a stable HTML/markup table you can + parse). Use your knowledge of Chinese econ data portals. +2. If an open/parseable endpoint EXISTS: implement a bespoke parser function and register it in + `_CUSTOM_PARSERS` in collectors/cn_indicators.py (same shape as the existing examples — + return a flat list of {"date": str, "value": number, **extra}). Update that source's entry in + config/cn_hf_sources.json: set its `url` to the working endpoint and access_method to + "open_json" or "open_csv" (or "scrape" if it needs HTML parsing but works). +3. If the source is GENUINELY paywalled / login-gated / hard anti-bot with no public endpoint: + leave access_method "todo" but improve its `note` with the exact blocker and the closest + public alternative (e.g. an official NBS/Customs series that proxies it). + +Sources to work through (todo ones in the catalog), e.g.: cpca_retail, cpca_wholesale, +steel_mysteel, steel_100njz_construction, cement_digital, coal_cctd, coal_power_consumption, +bdi, freight_road_logistics, yiwu_index, property_cric, property_zhongzhi_land, +macro_caixin_pmi, macro_cfl_pmi, macro_nbs, macro_pboc_credit, mobility_12306, +mobility_baidu_migration, mobility_baidu_congestion, mobility_gaode, mobility_box_office, +mobility_civil_aviation. (ccfi, scfi, macro_customs already work — use them as the template.) + +CONSTRAINTS: +- Match existing style; reuse httpx/pandas. Each parser must handle network/parse failure + gracefully (return [] + let the collector log and continue). UTF-8; ensure_ascii=False. +- Do NOT hardcode secrets. If a source needs a free API key, read it from an env var and document it. +- Everything must compile (python -m py_compile collectors/cn_indicators.py). +- After each source, note: WIRED (endpoint + sample value) or STILL-TODO (blocker). + +DELIVER: updated collectors/cn_indicators.py (more _CUSTOM_PARSERS) + updated +config/cn_hf_sources.json (access_method/url fixed for the ones you wired), plus a summary table +of which sources are now live vs still blocked and why. +``` + +--- + +## BATCH B — Unit-test fleet (one suite per module) + +```text +Senior Python engineer. Working dir: /Users/mrinal/social_scraper. Write pytest unit tests, +one test file per module, for the ECONOMIC/engineering modules only. This is parallel work: +one swarm agent per module. + +Read first: how the modules are structured + any existing tests/ layout. + +Cover ONLY these (do NOT touch ddti_* or censorship modules — those are handled separately): +- processors/conditions_index.py -> tests/test_conditions_index.py + Test compute_conditions: SD/AS/D math (D = 0.4*SD + 0.6*AS), momentum, mirror_gap, + confidence tiers, empty inputs, single-period (no momentum), the offline self-test path. +- collectors/comtrade_mirror.py -> tests/test_comtrade_mirror.py + Test parse() row shape, mirror flow inversion, graceful handling of empty/error responses + (mock httpx; no real network). +- collectors/cn_indicators.py -> tests/test_cn_indicators.py + Test the _CUSTOM_PARSERS (feed JSON fixtures matching the real shapes), todo-skip behavior, + _normalize_date / _normalize_value edge cases. +- processors/zh_finance.py -> tests/test_zh_finance.py + Test detect_chinese_policy_and_sectors: negation flips (不加息/尚未加息), substring (no \b), + sector counts. (This is finance/sentiment, neutral — fine.) +- processors/conditions_report.py -> tests/test_conditions_report.py + Test the context-assembly / formatting helpers with mocked LLM (no network). + +CONSTRAINTS: pytest; mock all network/DB (no live calls); tests must run offline and pass +(`python -m pytest tests/ -q`). Keep fixtures small and inline. Match existing import style. + +DELIVER: the test files + a run of `python -m pytest tests/ -q` with results. +``` + +--- + +## BATCH C — Data-quality validators + light security sweep (economic scope) + +```text +Senior Python engineer. Working dir: /Users/mrinal/social_scraper. Two parallel tracks; one +swarm agent per item. ECONOMIC/engineering scope only (skip ddti_*/censorship modules). + +TRACK 1 — Per-source data-quality validators: +For each WIRED economic source (comtrade_mirror + the live cn_indicators), add a small validator +that checks freshness (latest obs not older than the source's expected frequency * 2) and schema +(required fields present, value numeric, date parseable). Put them in a new module +processors/cbb_quality.py with one function per source + a runner returning a status report. + +TRACK 2 — Static safety review of the economic dashboard + route: +Review dashboards/conditions_dashboard.html and api/routes/conditions.py for: any innerHTML/ +template interpolation not passed through a strict escaper; any color/style built from raw data +instead of a fixed whitelist; any unvalidated query params in the route. Report findings with +file:line and a concrete fix; apply the fixes where safe. (The DDTI dashboard is reviewed +separately — do not modify it.) + +CONSTRAINTS: no new heavy deps; everything compiles; offline. UTF-8. +DELIVER: processors/cbb_quality.py + any dashboard/route fixes + a findings summary. +``` + +--- + +## After Kimi returns (Claude's part) +Send the output back to Claude to: review/integrate, run compiles + the offline self-tests, +verify each newly-wired source actually parses (compiling ≠ correct — CCFI proved that), and +handle the censorship/DDTI side + cross-domain coherence link. From 40036055dd30fc5e2cbf9fbd91dac56995dd2e29 Mon Sep 17 00:00:00 2001 From: mrinal Date: Thu, 18 Jun 2026 20:09:35 +0530 Subject: [PATCH 8/9] docs: add PALIMPSEST overview to README (China intel layer) Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index 67c77cc..eccc029 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,32 @@ +# PALIMPSEST — China Latent-State Intelligence + +**An independent, OSINT-driven read on what's happening inside China — built on the Social Scraper platform.** Two engines feed one unified dashboard: + +| Engine | What it measures | Status | +|---|---|---| +| **DDTI** — Deletion-Differential Threat Index | *Treats the censor as a sensor.* Ranks censored topics by attention × novelty from China Digital Times deletion data. | ✅ live data + dashboard | +| **CBB** — China Beige-Book-style conditions engine | Sector × region economic **diffusion indices** (`D = 0.4·SD + 0.6·AS`) from UN Comtrade mirror-trade + Chinese high-frequency indicators + sentiment. | ✅ engine + 3 live sources; 22 sources stubbed | +| **Unified app** | `dashboards/palimpsest_dashboard.html` — Overview (cross-domain synthesis) · Censorship · Economy, dark "intelligence terminal" UI, XSS-hardened. | ✅ | + +### View it +- **Offline:** open `dashboards/palimpsest_dashboard.html` (real data embedded, badged ● SNAPSHOT). +- **Served:** run the API → `GET /api/v4/ddti/app` (panels fetch live, badge ● LIVE). +- **Refresh data:** `python -m scripts.ddti_live_pull` (censorship) · `python scripts/conditions_pull.py` (economy). + +### Key components +- Collectors: `collectors/{ddti_probe,weibo_hotsearch,comtrade_mirror,cn_indicators}.py` +- Processors: `processors/{ddti_index,conditions_index,conditions_report,zh_finance}.py` +- Configs: `config/{cbb_taxonomy,cn_hf_sources,zh_finance_lexicon,zh_censorship_gazetteer,ddti_threat_categories}.json` +- Dashboards: `dashboards/{palimpsest,ddti,conditions}_dashboard.html` · Deploy: `vercel-app/` +- Plans & agent prompts: `PALIMPSEST_CBB_PLAN.md`, `PALIMPSEST_BRIEF.md`, `KIMI_CBB_PROMPT.md`, `KIMI_PROMPTS.md` + +### Design notes +- **Independent of official stats:** physical anchors (mirror-trade) + cross-source triangulation, not NBS figures. +- **Honest data states:** every view badges LIVE / SNAPSHOT / SAMPLE — never fakes signal. +- **Egress reality:** richer Chinese deletion/Weibo feeds need an in-China residential proxy; current data is what's reachable openly. + +--- + # Social Scraper Intelligence Platform v3.0 **Real-time financial intelligence aggregation across 15 data sources with NLP analysis, threat detection, and automated routing to downstream analytics dashboards.** From 11b70294f78ea94ea50a0fd82d0041e219fddaae Mon Sep 17 00:00:00 2001 From: mrinal Date: Fri, 19 Jun 2026 00:30:00 +0530 Subject: [PATCH 9/9] feat(cbb): four-sensor China conditions engine + platform README refresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expand the PALIMPSEST CBB engine from trade-only into four independent, credentials-free signal families, each with graceful degradation and a dedicated quality/validation layer: - Physical anchors: VIIRS nightlights, Sentinel-2 scene counts, electricity proxy (Ember/OWID), AIS port-call traffic - Elite signals: Politburo readouts, HK RVD property index, People's Daily byline frequency, SAFE balance-of-payments net errors - Trade + indicators: CN high-frequency parsers (cn_hf/: SCFI, Mysteel, 100njz construction steel), wired into cn_indicators - Quality layer: processors/cbb_quality.py — schema/numeric/freshness validation, plus pytest coverage for every CBB source Rewrite README as platform-first (Social Scraper) with PALIMPSEST as the flagship China latent-state intelligence layer; document the four CBB signal families, refreshed architecture diagram, stats, and design principles. Co-Authored-By: Claude Opus 4.8 (1M context) --- KIMI_PROMPTS.md | 10 + README.md | 287 ++++---- api/routes/conditions.py | 2 +- collectors/cn_hf/parsers/autos_cpca_retail.py | 44 ++ .../cn_hf/parsers/autos_cpca_wholesale.py | 44 ++ collectors/cn_hf/parsers/bdi.py | 142 ++++ collectors/cn_hf/parsers/ccfi.py | 111 ++++ .../cn_hf/parsers/coal_power_consumption.py | 41 ++ .../cn_hf/parsers/freight_road_logistics.py | 159 +++++ collectors/cn_hf/parsers/macro_customs.py | 88 +++ collectors/cn_hf/parsers/macro_nbs.py | 55 ++ collectors/cn_hf/parsers/macro_pboc_credit.py | 46 ++ collectors/cn_hf/parsers/mobility_12306.py | 48 ++ .../parsers/mobility_baidu_congestion.py | 49 ++ .../cn_hf/parsers/mobility_baidu_migration.py | 43 ++ .../cn_hf/parsers/mobility_box_office.py | 55 ++ .../cn_hf/parsers/mobility_civil_aviation.py | 50 ++ collectors/cn_hf/parsers/mobility_gaode.py | 157 +++++ collectors/cn_hf/parsers/property_cric.py | 38 ++ .../cn_hf/parsers/property_zhongzhi_land.py | 44 ++ collectors/cn_hf/parsers/scfi.py | 107 +++ .../parsers/steel_100njz_construction.py | 48 ++ collectors/cn_hf/parsers/steel_mysteel.py | 43 ++ collectors/cn_indicators.py | 173 +++-- collectors/elite_hk_property.py | 199 ++++++ collectors/elite_peoples_daily_byline.py | 147 +++++ collectors/elite_politburo_readouts.py | 189 ++++++ collectors/elite_safe_net_errors.py | 113 ++++ collectors/physical_ais_shipping.py | 86 +++ collectors/physical_electricity_proxy.py | 133 ++++ collectors/physical_sentinel2.py | 153 +++++ collectors/physical_viirs_nightlights.py | 79 +++ config/cn_hf_sources.json | 75 ++- config/sources.yaml | 177 ++--- dashboards/conditions_dashboard.html | 14 +- .../reports/deep_validation_2026-03-23.md | 613 ++++++++++++++++++ processors/cbb_quality.py | 425 ++++++++++++ reports/source_health_2026-03-24.md | 98 +++ reports/source_health_2026-04-09.md | 114 ++++ reports/weekly_deep_validation_2026-04-09.md | 369 +++++++++++ reports/weekly_deep_validation_2026-05-04.md | 390 +++++++++++ reports/weekly_deep_validation_2026-05-25.md | 380 +++++++++++ .../weekly_source_validation_2026-04-13.md | 396 +++++++++++ tests/test_cbb_quality.py | 197 ++++++ tests/test_cn_indicators.py | 506 +++++++++++++++ tests/test_comtrade_mirror.py | 335 ++++++++++ tests/test_conditions_index.py | 371 +++++++++++ tests/test_conditions_report.py | 370 +++++++++++ tests/test_zh_finance.py | 159 +++++ 49 files changed, 7651 insertions(+), 321 deletions(-) create mode 100644 collectors/cn_hf/parsers/autos_cpca_retail.py create mode 100644 collectors/cn_hf/parsers/autos_cpca_wholesale.py create mode 100644 collectors/cn_hf/parsers/bdi.py create mode 100644 collectors/cn_hf/parsers/ccfi.py create mode 100644 collectors/cn_hf/parsers/coal_power_consumption.py create mode 100644 collectors/cn_hf/parsers/freight_road_logistics.py create mode 100644 collectors/cn_hf/parsers/macro_customs.py create mode 100644 collectors/cn_hf/parsers/macro_nbs.py create mode 100644 collectors/cn_hf/parsers/macro_pboc_credit.py create mode 100644 collectors/cn_hf/parsers/mobility_12306.py create mode 100644 collectors/cn_hf/parsers/mobility_baidu_congestion.py create mode 100644 collectors/cn_hf/parsers/mobility_baidu_migration.py create mode 100644 collectors/cn_hf/parsers/mobility_box_office.py create mode 100644 collectors/cn_hf/parsers/mobility_civil_aviation.py create mode 100644 collectors/cn_hf/parsers/mobility_gaode.py create mode 100644 collectors/cn_hf/parsers/property_cric.py create mode 100644 collectors/cn_hf/parsers/property_zhongzhi_land.py create mode 100644 collectors/cn_hf/parsers/scfi.py create mode 100644 collectors/cn_hf/parsers/steel_100njz_construction.py create mode 100644 collectors/cn_hf/parsers/steel_mysteel.py create mode 100644 collectors/elite_hk_property.py create mode 100644 collectors/elite_peoples_daily_byline.py create mode 100644 collectors/elite_politburo_readouts.py create mode 100644 collectors/elite_safe_net_errors.py create mode 100644 collectors/physical_ais_shipping.py create mode 100644 collectors/physical_electricity_proxy.py create mode 100644 collectors/physical_sentinel2.py create mode 100644 collectors/physical_viirs_nightlights.py create mode 100644 monitoring/health/reports/deep_validation_2026-03-23.md create mode 100644 processors/cbb_quality.py create mode 100644 reports/source_health_2026-03-24.md create mode 100644 reports/source_health_2026-04-09.md create mode 100644 reports/weekly_deep_validation_2026-04-09.md create mode 100644 reports/weekly_deep_validation_2026-05-04.md create mode 100644 reports/weekly_deep_validation_2026-05-25.md create mode 100644 reports/weekly_source_validation_2026-04-13.md create mode 100644 tests/test_cbb_quality.py create mode 100644 tests/test_cn_indicators.py create mode 100644 tests/test_comtrade_mirror.py create mode 100644 tests/test_conditions_index.py create mode 100644 tests/test_conditions_report.py create mode 100644 tests/test_zh_finance.py diff --git a/KIMI_PROMPTS.md b/KIMI_PROMPTS.md index 9d0de32..1163eb7 100644 --- a/KIMI_PROMPTS.md +++ b/KIMI_PROMPTS.md @@ -2,6 +2,16 @@ Paste these into the Kimi Code window **when tokens reset**. Each block is self-contained. +## Completion status + +| Batch | Status | Summary | +|---|---|---| +| **A — Real scrapers for stubbed CN sources** | ✅ Partially complete | 6 sources wired live (`bdi`, `ccfi`, `freight_road_logistics`, `macro_customs`, `mobility_gaode`, `scfi`); 14 remain TODO stubs because no stable public endpoint exists. Parser modules under `collectors/cn_hf/parsers/` are auto-discovered by `collectors/cn_indicators.py`. | +| **B — Unit-test fleet** | ✅ Done | `tests/test_conditions_index.py`, `tests/test_comtrade_mirror.py`, `tests/test_cn_indicators.py`, `tests/test_zh_finance.py`, `tests/test_conditions_report.py` — all passing offline. | +| **C — Data-quality validators + security sweep** | ✅ Done | `processors/cbb_quality.py` + `tests/test_cbb_quality.py`; dashboard XSS/style whitelist fixes; route error-detail leak fixed. | + +> **Next:** any future Kimi runs should start from the current state (see files above) rather than re-running these batches from scratch. The remaining TODO sources still need public endpoints or robust scrapers before they can be promoted. + ## Rules that make Kimi succeed (learned the hard way) 1. **Launch Kimi INSIDE the project dir** — `cd /Users/mrinal/social_scraper` first. (Once it ran from `~` and spent ~12 min globbing the whole home folder.) 2. **Keep prompts neutral/economic.** Kimi (China-hosted) REFUSES politically sensitive China content (it 400'd on a censorship lexicon). So these batches cover only the **economic / engineering** layers. The censorship/DDTI modules are handled by Claude — do NOT ask Kimi to touch `ddti_*`, `zh_censorship_gazetteer`, or anything censorship-framed. diff --git a/README.md b/README.md index eccc029..7f3a176 100644 --- a/README.md +++ b/README.md @@ -1,79 +1,120 @@ -# PALIMPSEST — China Latent-State Intelligence +# Social Scraper Intelligence Platform -**An independent, OSINT-driven read on what's happening inside China — built on the Social Scraper platform.** Two engines feed one unified dashboard: +**A real-time OSINT collection-and-analysis engine: 15 data sources → NLP → routing, with a flagship China latent-state intelligence layer (PALIMPSEST) built on top.** -| Engine | What it measures | Status | -|---|---|---| -| **DDTI** — Deletion-Differential Threat Index | *Treats the censor as a sensor.* Ranks censored topics by attention × novelty from China Digital Times deletion data. | ✅ live data + dashboard | -| **CBB** — China Beige-Book-style conditions engine | Sector × region economic **diffusion indices** (`D = 0.4·SD + 0.6·AS`) from UN Comtrade mirror-trade + Chinese high-frequency indicators + sentiment. | ✅ engine + 3 live sources; 22 sources stubbed | -| **Unified app** | `dashboards/palimpsest_dashboard.html` — Overview (cross-domain synthesis) · Censorship · Economy, dark "intelligence terminal" UI, XSS-hardened. | ✅ | +![Python](https://img.shields.io/badge/Python-3.12-blue.svg) +![FastAPI](https://img.shields.io/badge/FastAPI-0.104+-009688.svg) +![Docker](https://img.shields.io/badge/Docker-Compose-2496ED.svg) +![Sources](https://img.shields.io/badge/Data_Sources-15-orange.svg) +![Collectors](https://img.shields.io/badge/Collectors-26-orange.svg) +![License](https://img.shields.io/badge/License-MIT-yellow.svg) -### View it -- **Offline:** open `dashboards/palimpsest_dashboard.html` (real data embedded, badged ● SNAPSHOT). -- **Served:** run the API → `GET /api/v4/ddti/app` (panels fetch live, badge ● LIVE). -- **Refresh data:** `python -m scripts.ddti_live_pull` (censorship) · `python scripts/conditions_pull.py` (economy). +> Collect from many noisy, biased sources → enrich with financial NLP → route the signal to the dashboards that need it. The same primitives power a China intelligence engine that estimates a *hidden state* from sensors no single one of which can be trusted. -### Key components -- Collectors: `collectors/{ddti_probe,weibo_hotsearch,comtrade_mirror,cn_indicators}.py` -- Processors: `processors/{ddti_index,conditions_index,conditions_report,zh_finance}.py` -- Configs: `config/{cbb_taxonomy,cn_hf_sources,zh_finance_lexicon,zh_censorship_gazetteer,ddti_threat_categories}.json` -- Dashboards: `dashboards/{palimpsest,ddti,conditions}_dashboard.html` · Deploy: `vercel-app/` -- Plans & agent prompts: `PALIMPSEST_CBB_PLAN.md`, `PALIMPSEST_BRIEF.md`, `KIMI_CBB_PROMPT.md`, `KIMI_PROMPTS.md` +--- -### Design notes -- **Independent of official stats:** physical anchors (mirror-trade) + cross-source triangulation, not NBS figures. -- **Honest data states:** every view badges LIVE / SNAPSHOT / SAMPLE — never fakes signal. -- **Egress reality:** richer Chinese deletion/Weibo feeds need an in-China residential proxy; current data is what's reachable openly. +## Table of Contents + +- [What this is](#what-this-is) +- [Flagship: PALIMPSEST — China latent-state intelligence](#flagship-palimpsest--china-latent-state-intelligence) +- [Platform features](#platform-features) +- [Tech stack](#tech-stack) +- [Architecture](#architecture) +- [Getting started](#getting-started) +- [Project structure](#project-structure) +- [Design principles](#design-principles) +- [License](#license) --- -# Social Scraper Intelligence Platform v3.0 +## What this is -**Real-time financial intelligence aggregation across 15 data sources with NLP analysis, threat detection, and automated routing to downstream analytics dashboards.** +Social Scraper is a Python **collector → processor → API** platform for financial and +geopolitical open-source intelligence. It pulls from 15 source types on configurable +schedules, runs each record through a financial-NLP pipeline (sentiment, entity +recognition, topic and threat classification, embeddings), and routes the result to +downstream analytics dashboards based on relevance. -![Python](https://img.shields.io/badge/Python-3.12-blue.svg) -![FastAPI](https://img.shields.io/badge/FastAPI-0.104+-green.svg) -![Docker](https://img.shields.io/badge/Docker-Compose-2496ED.svg) -![Sources](https://img.shields.io/badge/Data_Sources-15-orange.svg) -![License](https://img.shields.io/badge/License-MIT-yellow.svg) +Every collector follows the same contract — `collect() → parse() → validate()`, +subclassing `core.base_collector.BaseCollector` and registered in `config/sources.yaml` +— so adding a new sensor is a single file plus a config entry. That uniformity is what +makes the China-intelligence layer below possible: it's just *more sensors*, scored by +the same machinery. + +--- + +## Flagship: PALIMPSEST — China latent-state intelligence + +**An independent, OSINT-driven read on what's actually happening inside China.** It treats +"the true state of China" as a hidden variable estimated from many biased sensors — no +single official figure is trusted; signal comes from *triangulation*. Two engines feed one +dark "intelligence terminal" dashboard: + +| Engine | What it measures | Method | +|---|---|---| +| **DDTI** — Deletion-Differential Threat Index | *Treats the censor as a sensor.* What the regime deletes, how fast, and how selectively reveals what it fears. | Ranks censored topics by **threat = attention (time-decayed frequency) × novelty (burst / first-appearance)** from China Digital Times deletion data. | +| **CBB** — China Beige-Book-style conditions engine | Sector × region economic **diffusion indices**, independent of official NBS statistics. | `D = 0.4·SD + 0.6·AS` over four independent signal families (below). | + +### CBB signal families + +The conditions engine no longer rests on official trade data alone — it fuses four +*independent* sensor families, each a credentials-free public source that degrades +gracefully when unreachable: + +| Family | Sensors | Reads… | +|---|---|---| +| 🛰️ **Physical anchors** | VIIRS nighttime lights · Sentinel-2 scene counts · electricity generation (Ember/OWID) · AIS port-call traffic | Real economic activity from space and the grid — hard to fake. | +| 🏛️ **Elite signals** | Politburo meeting readouts · Hong Kong RVD property index · People's Daily byline frequency · SAFE balance-of-payments net errors | The regime's own behavior and capital-flow tells. | +| 📈 **Trade & indicators** | UN Comtrade mirror-trade · Chinese high-frequency indicators (`collectors/cn_hf/`) | Cross-checked partner-reported trade vs. domestic high-frequency prints. | +| 🗣️ **Sentiment** | Chinese finance/policy lexicon, negation-aware hawkish/dovish/sector scoring | Tone and policy direction in Chinese-language text. | + +Every source is checked by a dedicated **quality layer** (`processors/cbb_quality.py`, +covered by `tests/`) — schema validation, numeric sanity, and freshness windows — so a +stale or malformed feed is flagged rather than silently degrading the index. + +### View it + +- **Offline:** open `dashboards/palimpsest_dashboard.html` (real data embedded, badged ● SNAPSHOT). +- **Served:** run the API → `GET /api/v4/ddti/app` (panels fetch live, badge ● LIVE). +- **Refresh data:** `python -m scripts.ddti_live_pull` (censorship) · `python scripts/conditions_pull.py` (economy). --- -## Features +## Platform features -### Data Sources (15) +### Data sources (15) -- **Social platforms** -- Twitter, Reddit, Telegram, Discord, YouTube, Mastodon, Hacker News -- **Financial feeds** -- SEC EDGAR filings, Central Bank publications, RSS aggregation (16 feeds) -- **Developer intelligence** -- GitHub repository and release tracking -- **Dark web** -- Tor SOCKS5 proxy for threat intel, IOC extraction across 8 threat categories -- **General web** -- Configurable generic web scraper with article extraction +- **Social platforms** — Twitter, Reddit, Telegram, Discord, YouTube, Mastodon, Hacker News +- **Financial feeds** — SEC EDGAR filings, Central Bank publications, RSS aggregation (16 feeds) +- **Developer intelligence** — GitHub repository and release tracking +- **Dark web** — Tor SOCKS5 proxy for threat intel, IOC extraction across 8 threat categories +- **General web** — Configurable generic web scraper with article extraction ### Financial NLP -- **Sentiment analysis** -- FinBERT financial sentiment with VADER fallback; hawkish/dovish policy direction scoring -- **Entity recognition** -- spaCy NER extended with Indian financial entities (RBI, SEBI, NSE, CCIL, FIMMDA, FBIL) and policy terms (CRR, SLR, MIBOR, TREPS, LAF, MSF) -- **Topic classification** -- 13 categories including monetary policy, capital markets, crypto, commodities, and geopolitical -- **Ticker extraction** -- Automatic ticker detection, price mention parsing, earnings sentiment, and treasury relevance scoring -- **Threat intelligence** -- Classification across data breach, ransomware, credential theft, financial fraud, crypto threat, insider threat, supply chain, and sanctions evasion -- **Embeddings** -- all-MiniLM-L6-v2 (384-dim) stored in pgvector for semantic search, with Ollama fallback +- **Sentiment** — FinBERT financial sentiment with VADER fallback; hawkish/dovish policy scoring; optional free-LLM tier above FinBERT/VADER +- **Entity recognition** — spaCy NER extended with Indian financial entities (RBI, SEBI, NSE, CCIL, FIMMDA, FBIL) and policy terms (CRR, SLR, MIBOR, TREPS, LAF, MSF) +- **Topic classification** — 13 categories spanning monetary policy, capital markets, crypto, commodities, and geopolitics +- **Ticker extraction** — automatic ticker detection, price-mention parsing, earnings sentiment, treasury-relevance scoring +- **Threat intelligence** — classification across data breach, ransomware, credential theft, financial fraud, crypto threat, insider threat, supply chain, and sanctions evasion +- **Embeddings** — all-MiniLM-L6-v2 (384-dim) in pgvector for semantic search, with Ollama fallback -### Connectors +### Connectors & routing -- **DragonScope** -- Market analytics dashboard integration via Redis pub/sub and REST API push -- **LiquiFi** -- Indian treasury management dashboard with filtered content delivery -- **Smart Router** -- Classifies each piece of content and routes to DragonScope, LiquiFi, or both based on relevance scoring +- **DragonScope** — market-analytics dashboard integration via Redis pub/sub + REST push +- **LiquiFi** — Indian treasury-management dashboard with filtered content delivery +- **Smart Router** — scores each record's relevance and forwards it to DragonScope, LiquiFi, or both ### Infrastructure -- **Celery Beat scheduler** -- 24/7 automated collection with tiered frequencies (5 min to monthly) -- **Kafka pipeline** -- Decoupled ingestion and processing via topic-based message streaming -- **Health monitoring** -- Source reachability checks, structural fingerprinting, data freshness tracking, and Telegram alerting -- **AI-generated digests** -- Daily briefings via Claude or Ollama with citation-backed RAG Q&A +- **Celery Beat scheduler** — 24/7 collection with tiered frequencies (5 min → monthly) +- **Kafka pipeline** — decoupled ingestion and processing via topic-based streaming +- **Health monitoring** — reachability checks, structural fingerprinting, freshness tracking, Telegram alerting +- **AI-generated digests** — daily briefings via Claude or Ollama with citation-backed RAG Q&A --- -## Tech Stack +## Tech stack | Layer | Technology | |-------|-----------| @@ -81,32 +122,68 @@ | Database | TimescaleDB (PostgreSQL 16), pgvector, Alembic | | Queue | Apache Kafka (Confluent), Celery + Redis | | NLP | FinBERT (transformers), spaCy, sentence-transformers, VADER | -| LLM | Anthropic Claude API, Ollama (fallback) | +| LLM | Anthropic Claude API, Ollama (fallback), free-LLM router | | Scraping | httpx, BeautifulSoup, trafilatura, twikit, telethon | -| Object Storage | MinIO (S3-compatible) | -| Dark Web | Tor SOCKS5 proxy (dperson/torproxy) | +| Object storage | MinIO (S3-compatible) | +| Dark web | Tor SOCKS5 proxy (dperson/torproxy) | | Monitoring | Flower (Celery), Telegram Bot alerts | | Containers | Docker Compose (11 services) | --- -## Getting Started +## Architecture + +``` +DATA SOURCES PIPELINE SERVING +──────────── ──────── ─────── +Twitter ─┐ FastAPI +Reddit │ ├─ /search/semantic +Telegram │ ┌──────────┐ ┌───────────────┐ ├─ /ask (RAG) +Discord │ │ │ │ NLP Workers │ ├─ /trends +YouTube ├──>│ Kafka ├──>│ - FinBERT │──> PostgreSQL ├─ /digest +Mastodon │ │ │ │ - spaCy NER │ TimescaleDB├─ /data +GitHub │ └──────────┘ │ - Embeddings │ + pgvector ├─ /monitoring +SEC EDGAR │ │ - Topics │ └─ /api/v4/ddti (PALIMPSEST) +Central Banks│ └───────┬───────┘ +Hacker News │ │ +RSS Feeds │ v +Dark Web │ ┌───────────────┐ ┌────────────────┐ +Generic Web ─┘ │ Router │──>│ DragonScope │ + │ DS / LF / │ │ (Market View) │ + China sensors: │ Both │ ├────────────────┤ + 🛰️ physical anchors └───────────────┘ │ LiquiFi │ + 🏛️ elite signals ┌───────────────┐ │ (Treasury) │ + 📈 trade + indicators ──────> │ PALIMPSEST │ └────────────────┘ + 🗣️ sentiment │ DDTI + CBB │──> palimpsest_dashboard.html + └───────────────┘ +``` + +Data flows through three stages. **Collection**: 26 collectors pull from social platforms, +financial APIs, RSS, dark web, and China physical/elite sensors on schedules managed by +Celery Beat; raw content is published to Kafka and archived in MinIO. **Processing**: +NLP workers consume from Kafka — sentiment, entity extraction, topic/threat classification, +embeddings, plus the PALIMPSEST DDTI and CBB index processors. **Routing**: the smart +router scores each record and forwards it to DragonScope, LiquiFi, or both. + +--- + +## Getting started ### Prerequisites - Docker and Docker Compose -- API keys for desired data sources (see `.env.example`) +- API keys for the sources you want live (see `.env.example`) ### Setup ```bash git clone https://github.com/beepboop2025/social-scraper.git cd social-scraper -cp .env.example .env # Add your API keys and database password -docker compose up -d # Starts all services +cp .env.example .env # add your API keys and database password +docker compose up -d # starts all services ``` -The API will be available at `http://localhost:8000` and Flower (Celery monitoring) at `http://localhost:5555`. +The API is served at `http://localhost:8000`; Flower (Celery monitoring) at `http://localhost:5555`. ### Standalone (without Docker) @@ -116,80 +193,58 @@ python scripts/init_db.py uvicorn api.main:app --port 8000 ``` -### Common Operations +### Common operations ```bash -make up # Start all services -make down # Stop all services -make logs # Tail logs across services -make health # Run system health check -make test # Run test suite -make init # Initialize database schema -make migrate # Run Alembic migrations -make backfill # Backfill 30 days of historical data -make backup # Backup database to ./backups/ +make up # start all services make test # run the pytest suite +make down # stop all services make init # initialize database schema +make logs # tail logs across services make migrate # run Alembic migrations +make health # system health check make backfill # backfill 30 days of history ``` --- -## Architecture - -``` -DATA SOURCES PIPELINE SERVING -──────────── ──────── ─────── -Twitter ─┐ FastAPI -Reddit │ ├─ /search/semantic -Telegram │ ┌──────────┐ ┌───────────────┐ ├─ /ask (RAG) -Discord │ │ │ │ NLP Workers │ ├─ /trends -YouTube ├──>│ Kafka ├──>│ - FinBERT │──> PostgreSQL ├─ /digest -Mastodon │ │ │ │ - spaCy NER │ TimescaleDB├─ /data -GitHub │ └──────────┘ │ - Embeddings │ + pgvector └─ /monitoring -SEC EDGAR │ │ - Topics │ -Central Banks│ └───────┬───────┘ -Hacker News │ │ -RSS Feeds │ v -Dark Web │ ┌───────────────┐ ┌────────────────┐ -Generic Web ─┘ │ Router │──>│ DragonScope │ - │ DS / LF / │ │ (Market View) │ - ┌──────────┐ │ Both │ ├────────────────┤ - │ MinIO │ └───────────────┘ │ LiquiFi │ - │ (raw) │ │ (Treasury) │ - └──────────┘ ┌───────────────┐ └────────────────┘ - │ Health │ - │ Monitor │──> Telegram Alerts - └───────────────┘ -``` - -Data flows through three stages. **Collection**: 15 scrapers and collectors pull from social platforms, financial APIs, RSS feeds, and dark web sources on configurable schedules managed by Celery Beat. Raw content is published to Kafka topics and archived in MinIO. **Processing**: dedicated NLP workers consume from Kafka, running sentiment analysis, entity extraction, topic classification, threat detection, and embedding generation. Processed records are stored in TimescaleDB with pgvector indexes. **Routing**: the smart router evaluates each record's financial relevance and forwards it to DragonScope (market analytics), LiquiFi (treasury management), or both via Redis pub/sub and REST API calls. - ---- - -## Project Structure +## Project structure ``` social_scraper/ -├── scrapers/ # 15 data source scrapers -├── collectors/ # Automated data collectors (Celery tasks) +├── collectors/ # 26 collectors (incl. China physical_* / elite_* / cn_hf/) +├── scrapers/ # source-specific scrapers ├── analysis/ # NLP modules (sentiment, NER, topics, threat intel) -├── processors/ # Pipeline processors (embeddings, dedup, digest) -├── connectors/ # DragonScope + LiquiFi integrations + router +├── processors/ # pipeline + PALIMPSEST (ddti_index, conditions_index, +│ # conditions_report, zh_finance, cbb_quality) +├── connectors/ # DragonScope + LiquiFi integrations + smart router ├── pipeline/ # Kafka producer/consumer -├── api/ # FastAPI application and route modules -├── core/ # Base classes, registry, scheduler -├── storage/ # Models, raw store, vectors, TimescaleDB +├── api/ # FastAPI app and route modules (incl. /api/v4/ddti) +├── core/ # base collector/processor classes, registry, scheduler +├── storage/ # models, raw store, vectors, TimescaleDB ├── scheduler/ # Celery Beat configuration -├── monitoring/ # Data quality checks, health monitor, Telegram alerts -├── config/ # sources.yaml, alerts.yaml, processing.yaml -├── scripts/ # Database init, backfill, reprocessing utilities -├── tests/ # pytest suite -├── docker-compose.yml # Full service stack -├── Dockerfile -├── Makefile -└── requirements.txt +├── monitoring/ # data-quality checks, health monitor, Telegram alerts +├── config/ # sources.yaml, CBB taxonomy, CN lexicons, threat categories +├── dashboards/ # palimpsest / ddti / conditions terminals +├── scripts/ # db init, backfill, live data pulls +├── tests/ # pytest suite (incl. CBB quality + source validators) +├── docker-compose.yml # full service stack +├── Dockerfile · Makefile · requirements.txt ``` --- +## Design principles + +- **Estimate the hidden state, don't trust any single sensor.** Every signal is one + biased measurement; value comes from cross-source triangulation, not from any one feed. +- **Independent of official statistics.** Physical anchors (nightlights, electricity, + shipping) and partner-reported mirror-trade replace self-reported figures wherever possible. +- **Honest data states.** Every view badges LIVE / SNAPSHOT / SAMPLE — the system never + fabricates signal to fill a gap, and the CBB quality layer flags stale or malformed feeds. +- **Graceful degradation.** Sources behind auth walls or geo-blocks return empty and log a + warning rather than crashing the pipeline. +- **Egress reality.** Richer Chinese deletion/Weibo feeds need an in-China residential + proxy; current data is what's openly reachable — and labeled as such. + +--- + ## License MIT diff --git a/api/routes/conditions.py b/api/routes/conditions.py index cdc0fbc..7655e14 100644 --- a/api/routes/conditions.py +++ b/api/routes/conditions.py @@ -63,7 +63,7 @@ async def conditions_index(): return data except Exception as e: logger.warning(f"[Conditions-API] index read failed: {e}") - return JSONResponse({"status": "error", "error": str(e), "sectors": []}) + return JSONResponse({"status": "error", "error": "Internal server error", "sectors": []}) @router.get("/report") diff --git a/collectors/cn_hf/parsers/autos_cpca_retail.py b/collectors/cn_hf/parsers/autos_cpca_retail.py new file mode 100644 index 0000000..c6d90f2 --- /dev/null +++ b/collectors/cn_hf/parsers/autos_cpca_retail.py @@ -0,0 +1,44 @@ +"""CN-HF parser: CPCA passenger vehicle retail sales (autos_cpca_retail). + +The China Passenger Car Association (CPCA / 乘联会) publishes monthly passenger +vehicle retail sales via website articles, PDF reports and its WeChat public +account. There is no stable, public, unauthenticated JSON or CSV endpoint, and +extracting a clean monthly time-series requires scraping reports or parsing +dynamic pages. This parser is therefore marked as "todo" and returns an empty +result while logging the reason. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "autos_cpca_retail", + "name_zh": "乘联会乘用车零售销量", + "name_en": "CPCA China Passenger Vehicle Retail Sales", + "url": "https://www.cpcaauto.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "autos", + "difficulty": "hard", + "note": ( + "Monthly passenger-vehicle retail sales for China published by the China " + "Passenger Car Association (CPCA / 乘联会). The association releases the " + "figures via website articles, PDF reports and its WeChat public account; " + "there is no stable open JSON or CSV endpoint. Extracting a clean monthly " + "time-series requires scraping or parsing the published reports." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; this source requires a scraper or authenticated feed.""" + logger.warning( + "[%s] CPCA retail sales source is a TODO stub: no stable public " + "JSON/CSV endpoint available (website articles/PDF/WeChat only).", + SOURCE["key"], + ) + return [] diff --git a/collectors/cn_hf/parsers/autos_cpca_wholesale.py b/collectors/cn_hf/parsers/autos_cpca_wholesale.py new file mode 100644 index 0000000..60890c0 --- /dev/null +++ b/collectors/cn_hf/parsers/autos_cpca_wholesale.py @@ -0,0 +1,44 @@ +"""CN-HF parser: CPCA passenger vehicle wholesale sales (autos_cpca_wholesale). + +The China Passenger Car Association (CPCA / 乘联会) publishes monthly passenger +vehicle wholesale sales via website articles, PDF reports and its WeChat public +account. There is no stable, public, unauthenticated JSON or CSV endpoint, and +extracting a clean monthly time-series requires scraping reports or parsing +dynamic pages. This parser is therefore marked as "todo" and returns an empty +result while logging the reason. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "autos_cpca_wholesale", + "name_zh": "乘联会乘用车批发销量", + "name_en": "CPCA China Passenger Vehicle Wholesale Sales", + "url": "https://www.cpcaauto.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "autos", + "difficulty": "hard", + "note": ( + "Monthly passenger-vehicle wholesale sales for China published by the China " + "Passenger Car Association (CPCA / 乘联会). The association releases the " + "figures via website articles, PDF reports and its WeChat public account; " + "there is no stable open JSON or CSV endpoint. Extracting a clean monthly " + "time-series requires scraping or parsing the published reports." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; this source requires a scraper or authenticated feed.""" + logger.warning( + "[%s] CPCA wholesale sales source is a TODO stub: no stable public " + "JSON/CSV endpoint available (website articles/PDF/WeChat only).", + SOURCE["key"], + ) + return [] diff --git a/collectors/cn_hf/parsers/bdi.py b/collectors/cn_hf/parsers/bdi.py new file mode 100644 index 0000000..b50891d --- /dev/null +++ b/collectors/cn_hf/parsers/bdi.py @@ -0,0 +1,142 @@ +"""CN-HF parser: Baltic Dry Index (BDI). + +The Baltic Exchange's authoritative time-series is subscription-only, but +Investing.com publishes a public daily historical data table. This parser +scrapes the most recent rows from that table and returns observations for the +BDI composite index. +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Any + +import httpx +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "bdi", + "name_zh": "波罗的海干散货指数", + "name_en": "Baltic Dry Index (BDI)", + "url": "https://www.investing.com/indices/baltic-dry-historical-data", + "access_method": "scrape", + "frequency": "daily", + "sector": "transport_logistics", + "difficulty": "medium", + "unit": "index", + "note": ( + "Daily composite dry-bulk freight index published by the Baltic Exchange. " + "The official feed is subscription-only; this parser scrapes the public " + "Investing.com historical table as a best-effort open proxy. It is " + "subject to anti-bot/ToS limits and may return partial history." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Scrape the latest BDI daily observations from Investing.com. + + Each observation contains at least: + {"date": , "value": , "indicator": "bdi"} + """ + url = src.get("url", SOURCE["url"]) + headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + } + + try: + resp = await http.get(url, headers=headers) + if resp.status_code != 200: + logger.warning( + "[%s] HTTP %s from %s", + SOURCE["key"], + resp.status_code, + url, + ) + return [] + + soup = BeautifulSoup(resp.text, "html.parser") + rows = soup.find_all( + "tr", class_=lambda cls: cls and "historical-data-v2_price" in cls + ) + if not rows: + logger.warning("[%s] No historical data rows found at %s", SOURCE["key"], url) + return [] + + observations: list[dict] = [] + for row in rows: + cells = [td.get_text(strip=True) for td in row.find_all("td")] + if len(cells) < 2: + continue + + date_text = cells[0] + price_text = cells[1] + if not date_text or not price_text: + continue + + try: + obs_date = datetime.strptime(date_text, "%b %d, %Y").date() + value = float(price_text.replace(",", "")) + except (ValueError, TypeError) as e: + logger.warning( + "[%s] Could not parse date/value from %s: %s", + SOURCE["key"], + cells, + e, + ) + continue + + metadata = {"source_url": url, "price_type": "close"} + if len(cells) >= 3: + metadata["open"] = _to_float(cells[2]) + if len(cells) >= 4: + metadata["high"] = _to_float(cells[3]) + if len(cells) >= 5: + metadata["low"] = _to_float(cells[4]) + if len(cells) >= 7: + metadata["change_pct"] = _to_float(cells[6].replace("%", "")) + + observations.append( + { + "date": obs_date, + "value": value, + "indicator": SOURCE["key"], + "metadata": metadata, + } + ) + + logger.info( + "[%s] Collected %s observations from %s", + SOURCE["key"], + len(observations), + url, + ) + return observations + + except httpx.HTTPError as e: + logger.warning("[%s] Network error: %s", SOURCE["key"], e) + except Exception as e: + logger.warning("[%s] Unexpected error: %s", SOURCE["key"], e) + + return [] + + +def _to_float(text: str | None) -> float | None: + """Safely convert a cleaned string to float; return None on failure.""" + if text is None: + return None + cleaned = text.replace(",", "").replace("%", "").strip() + if cleaned == "" or cleaned == "-": + return None + try: + return float(cleaned) + except (ValueError, TypeError): + return None diff --git a/collectors/cn_hf/parsers/ccfi.py b/collectors/cn_hf/parsers/ccfi.py new file mode 100644 index 0000000..0393ca4 --- /dev/null +++ b/collectors/cn_hf/parsers/ccfi.py @@ -0,0 +1,111 @@ +"""CN-HF parser: China Containerized Freight Index (CCFI). + +The Shanghai Shipping Exchange publishes the latest weekly CCFI composite index +and route sub-indices as a public JSON endpoint. This parser fetches the +current release and returns the composite index value(s). +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "ccfi", + "name_zh": "中国出口集装箱运价指数", + "name_en": "China Containerized Freight Index (CCFI)", + "url": "https://en.sse.net.cn/currentIndex?indexName=ccfi", + "access_method": "open_json", + "frequency": "weekly", + "sector": "transport_logistics", + "difficulty": "easy", + "unit": "index", + "note": ( + "Published weekly (Fridays) by the Shanghai Shipping Exchange. " + "The public /currentIndex endpoint returns the latest CCFI composite index " + "and per-route sub-indices as JSON without authentication." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Fetch the latest CCFI release and return composite observations. + + Each observation contains at least: + {"date": , "value": , "indicator": "ccfi"} + """ + url = src.get("url", SOURCE["url"]) + try: + resp = await http.get(url) + if resp.status_code != 200: + logger.warning( + "[%s] HTTP %s from %s", + SOURCE["key"], + resp.status_code, + url, + ) + return [] + + payload = resp.json() + data = payload.get("data", {}) or {} + current_date_str = data.get("currentDate") + last_date_str = data.get("lastDate") + line_data = data.get("lineDataList", []) + + if not current_date_str or not line_data: + logger.warning("[%s] Unexpected payload shape: %s", SOURCE["key"], payload) + return [] + + composite = next( + ( + item + for item in line_data + if item.get("dataItemTypeName") == "CCFI_T" + or (item.get("properties") or {}).get("lineName_EN") == "COMPOSITE INDEX" + ), + None, + ) + if composite is None: + logger.warning("[%s] Composite index not found in payload", SOURCE["key"]) + return [] + + observations = [] + for date_str, value_key in ( + (current_date_str, "currentContent"), + (last_date_str, "lastContent"), + ): + if not date_str: + continue + try: + value = float(composite[value_key]) + obs_date = datetime.strptime(date_str, "%Y-%m-%d").date() + except (ValueError, TypeError, KeyError) as e: + logger.warning("[%s] Could not parse %s/%s: %s", SOURCE["key"], date_str, value_key, e) + continue + + observations.append( + { + "date": obs_date, + "value": value, + "indicator": SOURCE["key"], + "metadata": { + "name_en": "COMPOSITE INDEX", + "name_zh": "中国出口集装箱运价综合指数", + "release": "current" if value_key == "currentContent" else "previous", + }, + } + ) + + return observations + + except httpx.HTTPError as e: + logger.warning("[%s] Network error: %s", SOURCE["key"], e) + except Exception as e: + logger.warning("[%s] Parse error: %s", SOURCE["key"], e) + + return [] diff --git a/collectors/cn_hf/parsers/coal_power_consumption.py b/collectors/cn_hf/parsers/coal_power_consumption.py new file mode 100644 index 0000000..e6ddb7d --- /dev/null +++ b/collectors/cn_hf/parsers/coal_power_consumption.py @@ -0,0 +1,41 @@ +"""CN-HF parser stub: thermal coal consumption (coal_power_consumption). + +Daily coal burn / thermal power coal consumption for China is not available +through a stable, public, unauthenticated JSON or CSV endpoint. CCTD and +provincial-grid operators publish figures as HTML tables and PDF reports that +require scraping or manual extraction. This parser is therefore marked as +"todo" and returns an empty result while logging the reason. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "coal_power_consumption", + "name_zh": "火电煤炭消费量", + "name_en": "Thermal coal consumption", + "url": "https://www.cctd.com.cn/", + "access_method": "todo", + "frequency": "daily", + "sector": "coal", + "difficulty": "hard", + "note": ( + "No documented public API for daily coal burn at Chinese power plants; " + "CCTD and provincial grid data are published as HTML tables/PDFs that " + "require scraping or manual extraction." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; this source requires a scraper or authenticated feed.""" + logger.warning( + "[%s] Coal power consumption source is a TODO stub: no stable public " + "JSON/CSV endpoint available (CCTD HTML/PDF only).", + SOURCE["key"], + ) + return [] diff --git a/collectors/cn_hf/parsers/freight_road_logistics.py b/collectors/cn_hf/parsers/freight_road_logistics.py new file mode 100644 index 0000000..8070bcd --- /dev/null +++ b/collectors/cn_hf/parsers/freight_road_logistics.py @@ -0,0 +1,159 @@ +"""Parser for the China Road Logistics Price Index. + +The index is published weekly by the China Federation of Logistics and Purchasing +(CFLP) and Guangdong Lin'an Logistics Group as HTML press releases on +chinawuliu.com.cn. This collector scrapes the latest weekly report page and +extracts the headline composite index plus vehicle/LTL sub-indices. +""" + +import logging +import re +from datetime import datetime, timezone +from urllib.parse import urljoin + +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "freight_road_logistics", + "name_zh": "中国公路物流运价指数", + "name_en": "China Road Logistics Price Index", + "url": "http://www.chinawuliu.com.cn/xsyj/tjsj/", + "access_method": "scrape", + "frequency": "weekly", + "sector": "transport_logistics", + "difficulty": "medium", + "unit": "index", + "note": ( + "Weekly road-freight price index published by CFLP and Lin'an Logistics. " + "Scraped from public HTML report pages on chinawuliu.com.cn." + ), +} + +_BASE_URL = "http://www.chinawuliu.com.cn" +_LISTING_URL = f"{_BASE_URL}/xsyj/tjsj/" +_REPORT_TITLE_PATTERN = re.compile(r"中国公路物流运价周指数报告") +_TITLE_DATE_PATTERN = re.compile(r"[((](\d{4})\.(\d{1,2})\.(\d{1,2})[))]") +_PUBLISH_DATE_PATTERN = re.compile(r"发布时间[::]\s*(\d{4}-\d{2}-\d{2})") +_VALUE_PATTERNS = { + "composite": re.compile(r"中国公路物流运价指数为\s*(\d+(?:\.\d+)?)\s*点"), + "vehicle": re.compile(r"整车指数为\s*(\d+(?:\.\d+)?)\s*点"), + "ltl_light": re.compile(r"零担轻货指数为\s*(\d+(?:\.\d+)?)\s*点"), + "ltl_heavy": re.compile(r"零担重货指数为\s*(\d+(?:\.\d+)?)\s*点"), +} + + +def _extract_report_url(soup: BeautifulSoup) -> str | None: + """Find the URL of the latest weekly road-logistics index report.""" + for link in soup.find_all("a", href=True): + title = link.get("title") or link.get_text(strip=True) + if _REPORT_TITLE_PATTERN.search(title): + return urljoin(_BASE_URL, link["href"]) + return None + + +def _parse_date_from_title(title: str) -> datetime | None: + """Parse a date like '(2026.6.5)' from the report title.""" + match = _TITLE_DATE_PATTERN.search(title) + if match: + year, month, day = match.groups() + try: + return datetime(int(year), int(month), int(day), tzinfo=timezone.utc) + except ValueError: + pass + return None + + +def _parse_publish_date(text: str) -> datetime | None: + """Parse the publish timestamp printed on the article page.""" + match = _PUBLISH_DATE_PATTERN.search(text) + if match: + try: + return datetime.strptime(match.group(1), "%Y-%m-%d").replace(tzinfo=timezone.utc) + except ValueError: + pass + return None + + +def _extract_values(text: str) -> dict[str, float]: + """Extract the composite and sub-index values from report text.""" + values: dict[str, float] = {} + for component, pattern in _VALUE_PATTERNS.items(): + match = pattern.search(text) + if match: + try: + values[component] = float(match.group(1)) + except ValueError: + logger.warning( + f"[freight_road_logistics] Could not convert value for {component}" + ) + return values + + +async def collect(http, src: dict) -> list[dict]: + """Return the latest weekly road-logistics index observations. + + Each observation contains at least: + {"date": , "value": , "indicator": "freight_road_logistics"} + """ + try: + list_resp = await http.get(_LISTING_URL) + list_resp.raise_for_status() + list_soup = BeautifulSoup(list_resp.text, "lxml") + + report_url = _extract_report_url(list_soup) + if not report_url: + logger.warning( + "[freight_road_logistics] No weekly report link found on listing page" + ) + return [] + + report_resp = await http.get(report_url) + report_resp.raise_for_status() + report_soup = BeautifulSoup(report_resp.text, "lxml") + + title = report_soup.title.string if report_soup.title else "" + article = report_soup.find("div", class_="text") or report_soup + text = article.get_text(" ", strip=True) + + # Determine observation date: prefer explicit date in title, fall back to publish date. + obs_date = _parse_date_from_title(title) or _parse_publish_date(text) + if obs_date is None: + logger.warning( + "[freight_road_logistics] Could not determine observation date" + ) + return [] + + values = _extract_values(text) + if "composite" not in values: + logger.warning( + "[freight_road_logistics] Composite index value not found in report" + ) + return [] + + observations = [] + component_names = { + "composite": "composite", + "vehicle": "vehicle", + "ltl_light": "ltl_light", + "ltl_heavy": "ltl_heavy", + } + for component, value in values.items(): + observations.append( + { + "date": obs_date, + "value": value, + "indicator": "freight_road_logistics", + "component": component_names.get(component, component), + } + ) + + logger.info( + f"[freight_road_logistics] Collected {len(observations)} values for {obs_date.date()}" + ) + return observations + + except Exception as e: + logger.warning(f"[freight_road_logistics] Collection failed: {e}") + return [] diff --git a/collectors/cn_hf/parsers/macro_customs.py b/collectors/cn_hf/parsers/macro_customs.py new file mode 100644 index 0000000..1a7b01e --- /dev/null +++ b/collectors/cn_hf/parsers/macro_customs.py @@ -0,0 +1,88 @@ +"""China monthly import/export trade data from China Data Portal (GACC).""" + +import logging +from datetime import datetime, timezone + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "macro_customs", + "name_zh": "中国进出口贸易总额(海关月度)", + "name_en": "China Monthly Import and Export Trade (GACC)", + "url": "https://chinadata.live/api/v2/data/china-trade-monthly", + "access_method": "open_json", + "frequency": "monthly", + "sector": "macro", + "difficulty": "easy", + "note": ( + "Free no-key JSON API from China Data Portal, sourced from General " + "Administration of Customs of China (GACC) official monthly releases. " + "Returns total trade, exports, imports and trade balance in USD millions." + ), +} + +# Metrics published for each month. The API also returns ytd_* fields; we keep +# the four headline series to stay aligned with the source description. +_METRICS = ("total", "export", "import", "balance") + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Fetch monthly China trade data and return one observation per metric.""" + url = src.get("url", SOURCE["url"]) + + try: + resp = await http.get(url) + resp.raise_for_status() + payload = resp.json() + except Exception as e: + logger.warning(f"[macro_customs] Failed to fetch trade data: {e}") + return [] + + if not isinstance(payload, dict) or not payload.get("success"): + logger.warning("[macro_customs] API returned unsuccessful payload") + return [] + + series = payload.get("data", {}).get("data") + if not isinstance(series, list): + logger.warning("[macro_customs] No data array in API response") + return [] + + observations = [] + for row in series: + if not isinstance(row, dict): + continue + + period = row.get("date") + if not period: + continue + + try: + dt = datetime.strptime(str(period), "%Y-%m").replace(tzinfo=timezone.utc) + except Exception as e: + logger.warning(f"[macro_customs] Could not parse date '{period}': {e}") + continue + + for metric in _METRICS: + raw_value = row.get(metric) + if raw_value is None: + continue + try: + value = float(raw_value) + except (TypeError, ValueError): + logger.warning( + f"[macro_customs] Non-numeric {metric} for {period}: {raw_value}" + ) + continue + + observations.append({ + "date": dt, + "value": value, + "indicator": f"{SOURCE['key']}_{metric}", + "period": period, + "unit": "USD Million", + }) + + logger.info(f"[macro_customs] Collected {len(observations)} observations") + return observations diff --git a/collectors/cn_hf/parsers/macro_nbs.py b/collectors/cn_hf/parsers/macro_nbs.py new file mode 100644 index 0000000..346d8ab --- /dev/null +++ b/collectors/cn_hf/parsers/macro_nbs.py @@ -0,0 +1,55 @@ +"""NBS China macroeconomic data parser (TODO stub). + +The National Bureau of Statistics of China (NBS / 国家统计局) publishes monthly +macroeconomic releases (CPI, PPI, industrial production, retail sales, +fixed-asset investment, surveyed urban unemployment, etc.) through the +National Data (EasyQuery) portal at data.stats.gov.cn. + +The EasyQuery endpoints can be queried programmatically, but they: + * return JSONP/HTML that must be unwrapped and parsed, + * require constructing undocumented ``wd`` / ``dfwds`` dimension parameters, + * use hierarchical indicator codes that change over time, + * are protected by anti-bot measures and have no documented open API key. + +Because there is no stable, public, credential-free endpoint, this parser is +kept as a TODO stub. A future implementation could either reverse-engineer +the EasyQuery JSONP protocol or scrape the English/Chinese monthly bulletin +pages on stats.gov.cn and extract the headline tables. +""" + +import logging + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "macro_nbs", + "name_zh": "国家统计局宏观数据", + "name_en": "NBS China Macroeconomic Data", + "url": "https://data.stats.gov.cn/easyquery.htm?cn=A01", + "access_method": "todo", + "frequency": "monthly", + "sector": "macro", + "difficulty": "hard", + "note": ( + "Monthly macroeconomic releases from the National Bureau of Statistics of China. " + "The EasyQuery portal can be queried via parameterized endpoints, but responses are " + "JSONP/HTML, require undocumented wd/dfwds parameters, and are protected by anti-bot " + "measures with no documented open API key. Marked todo until a stable scraper or API " + "client is implemented." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """TODO: fetch NBS macroeconomic observations. + + Currently returns an empty list because data.stats.gov.cn has no stable + public, credential-free endpoint. All errors are handled gracefully. + """ + logger.warning( + "[macro_nbs] TODO: NBS EasyQuery parser not yet implemented; " + "source requires undocumented JSONP parameters and is protected by anti-bot measures." + ) + return [] diff --git a/collectors/cn_hf/parsers/macro_pboc_credit.py b/collectors/cn_hf/parsers/macro_pboc_credit.py new file mode 100644 index 0000000..d10f00e --- /dev/null +++ b/collectors/cn_hf/parsers/macro_pboc_credit.py @@ -0,0 +1,46 @@ +"""PBOC credit, money supply and aggregate financing parser. + +The People’s Bank of China publishes monthly monetary statistics, sources/uses +of credit funds, and aggregate financing data on its official portal. As of the +Batch A survey there is no stable open JSON or CSV endpoint; the configured URL +and known alternative paths return 404 / are blocked from this environment. +This module is therefore registered as ``access_method="todo"`` and degrades +gracefully while preserving the source metadata for a future scraper. +""" + +import logging + +logger = logging.getLogger(__name__) + + +SOURCE: dict = { + "key": "macro_pboc_credit", + "name_zh": "人民银行信贷收支与货币供应", + "name_en": "PBOC Credit, Money Supply and Aggregate Financing", + "url": "http://www.pbc.gov.cn/en/3688240/index.html", + "access_method": "todo", + "frequency": "monthly", + "sector": "macro", + "difficulty": "hard", + "note": ( + "Monthly money supply, sources/uses of credit funds and aggregate " + "financing data from the People’s Bank of China. No stable open JSON or " + "CSV endpoint is currently available; figures are published as HTML/Excel " + "tables that require scraping and Chinese date parsing. Marked todo until " + "a reliable public access path is confirmed." + ), +} + + +async def collect(http, src: dict) -> list[dict]: + """No-op collector for the PBOC credit source. + + Returns an empty list until a stable public endpoint or scrape target is + identified. All failures are handled internally. + """ + logger.warning( + "[macro_pboc_credit] TODO: PBOC credit parser not yet implemented — " + "no stable public JSON/CSV endpoint available (src=%s)", + src.get("key", "macro_pboc_credit"), + ) + return [] diff --git a/collectors/cn_hf/parsers/mobility_12306.py b/collectors/cn_hf/parsers/mobility_12306.py new file mode 100644 index 0000000..863b251 --- /dev/null +++ b/collectors/cn_hf/parsers/mobility_12306.py @@ -0,0 +1,48 @@ +"""12306 railway passenger traffic parser. + +12306 (China Railway's ticketing platform) does not expose a public, +unauthenticated API for passenger volume or ticket/booking data. The official +site is a commercial booking engine and any bulk data requires authenticated +access or scraping behind anti-bot protections. + +This module therefore exposes the source metadata as a TODO stub and returns +an empty observation list. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "mobility_12306", + "name_zh": "12306铁路客运量", + "name_en": "12306 Railway Passenger Traffic", + "url": "https://www.12306.cn", + "access_method": "todo", + "frequency": "daily", + "sector": "transport_logistics", + "difficulty": "hard", + "note": ( + "12306 has no public API. Booking/search data is only available by " + "scraping the official site or via Ministry of Transport monthly aggregate " + "railway passenger reports. Marked todo until a stable public endpoint or " + "scraper is implemented." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations. + + 12306 passenger traffic is not available from a public unauthenticated + endpoint. Keep the collector stubbed as TODO so the dispatcher can skip it + gracefully without breaking the fan-out. + """ + logger.warning( + "[mobility_12306] TODO: no public API for 12306 railway passenger " + "traffic; returning empty observations." + ) + return [] diff --git a/collectors/cn_hf/parsers/mobility_baidu_congestion.py b/collectors/cn_hf/parsers/mobility_baidu_congestion.py new file mode 100644 index 0000000..00ca035 --- /dev/null +++ b/collectors/cn_hf/parsers/mobility_baidu_congestion.py @@ -0,0 +1,49 @@ +"""Baidu Maps city congestion index (百度地图城市拥堵指数). + +The authoritative, real-time congestion curve is served by Baidu Maps +internal/proprietary endpoints (jiaotong.baidu.com) and the Baidu Maps +LBS API, which require a platform API key (ak) and are governed by Baidu +platform terms. Public, unauthenticated bulk JSON/CSV endpoints are not +available, and the public report pages are heavily dynamic/anti-bot. + +This module is therefore registered as ``todo`` and degrades gracefully. +""" + +import logging + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "mobility_baidu_congestion", + "name_zh": "百度地图城市拥堵指数", + "name_en": "Baidu Maps City Congestion Index", + "url": "https://jiaotong.baidu.com/reports/", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": ( + "Baidu Maps publishes city congestion rankings and reports on " + "jiaotong.baidu.com, but no open bulk JSON/CSV endpoint exists. " + "Real-time road traffic requires a Baidu Maps API key (ak) and is " + "restricted by platform terms; implementation needs a scraper or " + "authenticated API integration." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return empty list; Baidu congestion data is not available publicly. + + A production implementation could call the Baidu Maps LBS Traffic Status + API (https://lbsyun.baidu.com/index.php?title=webapi/traffic) with a + valid ``ak`` and parse the returned congestion index curve, or scrape the + dynamic report pages on jiaotong.baidu.com. + """ + logger.warning( + "[mobility_baidu_congestion] TODO: Baidu Maps congestion data requires " + "an authenticated API key or a dynamic-page scraper; returning empty." + ) + return [] diff --git a/collectors/cn_hf/parsers/mobility_baidu_migration.py b/collectors/cn_hf/parsers/mobility_baidu_migration.py new file mode 100644 index 0000000..b8b2c01 --- /dev/null +++ b/collectors/cn_hf/parsers/mobility_baidu_migration.py @@ -0,0 +1,43 @@ +"""Baidu Migration Index (Qianxi) — TODO stub. + +Baidu Qianxi exposes daily inter-city migration indices through undocumented +JSONP endpoints on huiyan.baidu.com (cityrank, provincerank, historycurve, +lastdate). The endpoints are public and do not require an API key, but they +need correct region codes, JSONP wrapper stripping,Referer/UA headers and are +subject to geo-blocking and cookie rotation. Until a robust, lawful scraper is +implemented this source is kept as a TODO stub. +""" + +import logging + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "mobility_baidu_migration", + "name_zh": "百度迁徙", + "name_en": "Baidu Migration Index (Qianxi)", + "url": "http://huiyan.baidu.com/migration/cityrank.jsonp", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": ( + "Undocumented JSONP endpoints on huiyan.baidu.com (cityrank/provincerank/" + "historycurve/lastdate) expose daily move-in/move-out rankings and " + "migration-scale indices by region ID. No API key is required, but requests " + "need correct region codes, JSONP stripping, and geo-blocking/cookie handling. " + "Marked TODO until a robust scraper/collector is implemented." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; Baidu Qianxi parser is TODO.""" + logger.warning( + "[%s] Baidu Qianxi migration collector is TODO: undocumented JSONP endpoints " + "require region codes, JSONP stripping and header/cookie handling", + src.get("key", "mobility_baidu_migration"), + ) + return [] diff --git a/collectors/cn_hf/parsers/mobility_box_office.py b/collectors/cn_hf/parsers/mobility_box_office.py new file mode 100644 index 0000000..fb78e98 --- /dev/null +++ b/collectors/cn_hf/parsers/mobility_box_office.py @@ -0,0 +1,55 @@ +"""Mobility & Box Office parser. + +Maoyan (猫眼) publishes a real-time box-office dashboard at +https://piaofang.maoyan.com/dashboard. The underlying JSON endpoint +``/dashboard-ajax/movie`` is public but protects numeric values with a +custom icon font, requires dynamic signatures/timestamps, cookie rotation +and Referer headers, and changes its obfuscation periodically. Public, +unauthenticated, stable daily box-office or mobility-proxy time-series are +therefore not currently available. + +This module keeps the source metadata and exposes a no-op collector so the +dispatcher can skip it gracefully. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "mobility_box_office", + "name_zh": "出行与电影票房", + "name_en": "Mobility & Box Office", + "url": "https://piaofang.maoyan.com/dashboard", + "access_method": "todo", + "frequency": "daily", + "sector": "mobility", + "difficulty": "hard", + "note": ( + "Composite high-frequency proxy for Chinese consumer mobility and " + "discretionary services. Maoyan dashboard loads real-time box-office via " + "https://piaofang.maoyan.com/dashboard-ajax/movie (JSON) but requires " + "dynamic signatures, timestamp, cookie rotation and Referer headers. " + "Mobility proxies (Baidu Qianxi migration index, Amap city congestion " + "index) are also scrape-only and periodically change obfuscation. " + "Marked todo until a stable, lawful access path is implemented." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; this source is TODO. + + Maoyan's dashboard requires anti-bot/font-decoding handling that is not + reliably achievable with the allowed public-data-only dependency set. + """ + logger.warning( + "[%s] TODO: Maoyan box-office / mobility dashboard is protected by " + "dynamic signatures and custom icon-font obfuscation; returning empty " + "observations.", + src.get("key", SOURCE["key"]), + ) + return [] diff --git a/collectors/cn_hf/parsers/mobility_civil_aviation.py b/collectors/cn_hf/parsers/mobility_civil_aviation.py new file mode 100644 index 0000000..f84660e --- /dev/null +++ b/collectors/cn_hf/parsers/mobility_civil_aviation.py @@ -0,0 +1,50 @@ +"""CAAC monthly civil aviation KPI parser. + +The Civil Aviation Administration of China (CAAC / 中国民航局) publishes monthly +production indicator statistics (passenger trips, cargo/mail, aircraft +movements) as PDF attachments on a Chinese-language index page: + + https://www.caac.gov.cn/XXGK/XXGK/TJSJ/TJSJ_1/ + +No stable, public, unauthenticated JSON or CSV endpoint is available. +Extracting a clean time-series requires scraping the index, downloading PDF +attachments, and parsing tables, which is beyond the allowed dependency set +for this batch. This parser is therefore marked as "todo" and returns an +empty observation list while logging the reason. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "mobility_civil_aviation", + "name_zh": "中国民航月度主要生产指标统计", + "name_en": "CAAC Monthly Civil Aviation KPIs", + "url": "https://www.caac.gov.cn/XXGK/XXGK/TJSJ/TJSJ_1/", + "access_method": "todo", + "frequency": "monthly", + "sector": "mobility", + "difficulty": "hard", + "note": ( + "CAAC publishes monthly production indicator PDFs (passenger trips, " + "cargo/mail, aircraft movements) on a Chinese-language index page. No " + "open JSON/CSV or machine-readable API endpoint was found; extraction " + "requires scraping the index, downloading PDF attachments, and parsing " + "tables." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; this source requires a PDF/table scraper.""" + logger.warning( + "[%s] TODO: CAAC monthly civil aviation KPIs are only available as " + "PDF attachments on the CAAC statistics index page; no stable public " + "JSON/CSV endpoint exists. Returning empty observations.", + SOURCE["key"], + ) + return [] diff --git a/collectors/cn_hf/parsers/mobility_gaode.py b/collectors/cn_hf/parsers/mobility_gaode.py new file mode 100644 index 0000000..a8734de --- /dev/null +++ b/collectors/cn_hf/parsers/mobility_gaode.py @@ -0,0 +1,157 @@ +"""CN-HF parser: Amap (Gaode) city congestion delay index. + +Amap's public traffic-report dashboard exposes a JSON endpoint that returns the +current national average of the road-network trip delay index (路网行程延时指数) +along with related traffic-health indicators. This parser fetches that public +endpoint and returns the daily observation. +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +_SOURCE_KEY = "mobility_gaode" +_API_URL = "https://report.amap.com/diagnosis/ajax/countryindicators.do" +_PRIMARY_INDICATOR = "路网行程延时指数" + +SOURCE: dict[str, Any] = { + "key": _SOURCE_KEY, + "name_zh": "高德地图城市拥堵延时指数", + "name_en": "Amap City Congestion Delay Index", + "url": _API_URL, + "access_method": "open_json", + "frequency": "daily", + "sector": "mobility", + "difficulty": "medium", + "unit": "index", + "note": ( + "Amap's public traffic-report dashboard exposes an open JSON endpoint " + "(report.amap.com/diagnosis/ajax/countryindicators.do) returning the " + "current national average of the road-network trip delay index and related " + "traffic-health indicators. No authentication is required." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Fetch Amap's daily national traffic-health indicators. + + Returns at least one observation for the congestion delay index: + {"date": , "value": , "indicator": "mobility_gaode"} + + Additional related indicators (e.g. average speed, congestion share) are + returned as sub-indicator observations when available. + """ + url = src.get("url", SOURCE["url"]) + try: + resp = await http.get(url) + if resp.status_code != 200: + logger.warning( + "[%s] HTTP %s from %s", + _SOURCE_KEY, + resp.status_code, + url, + ) + return [] + + payload = resp.json() + if not isinstance(payload, list): + logger.warning("[%s] Unexpected payload shape: %s", _SOURCE_KEY, payload) + return [] + + obs_date = datetime.now(timezone.utc).date() + observations: list[dict] = [] + + for record in payload: + indicator_name = record.get("indicator") + avg_value = record.get("avg") + if not indicator_name or avg_value is None: + continue + + try: + value = float(avg_value) + except (ValueError, TypeError): + logger.warning( + "[%s] Could not parse avg value %r for %s", + _SOURCE_KEY, + avg_value, + indicator_name, + ) + continue + + indicator = ( + _SOURCE_KEY + if indicator_name == _PRIMARY_INDICATOR + else f"{_SOURCE_KEY}_{_slug(indicator_name)}" + ) + + observations.append( + { + "date": obs_date, + "value": value, + "indicator": indicator, + "metadata": { + "name_zh": indicator_name, + "top_city": record.get("topCityName"), + "max_value": record.get("maxValue"), + "cities_above_avg": record.get("numGTAvg"), + }, + } + ) + + return observations + + except httpx.HTTPError as e: + logger.warning("[%s] Network error: %s", _SOURCE_KEY, e) + except Exception as e: + logger.warning("[%s] Parse error: %s", _SOURCE_KEY, e) + + return [] + + +def _slug(name: str) -> str: + """Create an ASCII-only sub-indicator slug from a Chinese indicator name.""" + name = name.strip().lower() + replacements = { + "(": "_", + ")": "", + "(": "_", + ")": "", + "/": "_", + " ": "_", + } + for old, new in replacements.items(): + name = name.replace(old, new) + # Drop units/symbols that don't add signal to the slug. + name = name.replace("%", "pct").replace("·", "_") + # Pinyin-ish romanisation is not reliable, so transliterate common terms. + name = ( + name.replace("路网", "road_network_") + .replace("高延时", "high_delay_") + .replace("拥堵", "congestion_") + .replace("延时", "delay_") + .replace("指数", "index") + .replace("运行", "run_") + .replace("时间", "time_") + .replace("占比", "share") + .replace("路段", "link_") + .replace("里程", "mileage_") + .replace("比", "ratio") + .replace("常发", "frequent_") + .replace("行程", "trip_") + .replace("道路", "road_") + .replace("偏差率", "deviation_rate") + .replace("平均", "avg_") + .replace("速度", "speed_") + .replace("高", "high_") + ) + name = name.strip("_") + # Remove any remaining non-ascii/non-alphanumeric characters. + name = "".join(ch if ch.isascii() and (ch.isalnum() or ch == "_") else "_" for ch in name) + return name.strip("_") or "unknown" diff --git a/collectors/cn_hf/parsers/property_cric.py b/collectors/cn_hf/parsers/property_cric.py new file mode 100644 index 0000000..b58dce2 --- /dev/null +++ b/collectors/cn_hf/parsers/property_cric.py @@ -0,0 +1,38 @@ +"""CRIC (克尔瑞) China real-estate data parser. + +CRIC is a commercial/proprietary provider; no stable public open API exists for +its sales-volume, inventory, price, land-auction or developer-ranking series. +This module is therefore a TODO stub per the Batch A public-data-only rule. +""" + +import logging + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "property_cric", + "name_zh": "克尔瑞房地产数据", + "name_en": "CRIC China Real Estate Data", + "url": "https://www.cricbigdata.com/", + "access_method": "todo", + "frequency": "monthly", + "sector": "property", + "difficulty": "hard", + "note": ( + "Commercial/proprietary real-estate data from CRIC (克尔瑞). " + "No public open API; indicators are behind a subscription/paywall. " + "Stubbed until a lawful public endpoint or scraper is available." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; CRIC data is not available via a public endpoint.""" + logger.warning( + "[%s] CRIC data is proprietary/paywalled; no public collection implemented. " + "Set access_method='todo'.", + src.get("key", "property_cric"), + ) + return [] diff --git a/collectors/cn_hf/parsers/property_zhongzhi_land.py b/collectors/cn_hf/parsers/property_zhongzhi_land.py new file mode 100644 index 0000000..82d783d --- /dev/null +++ b/collectors/cn_hf/parsers/property_zhongzhi_land.py @@ -0,0 +1,44 @@ +"""CREIS / Zhongzhi (中指云) China land auction and transaction data. + +The public portal at cih-index.com only shows limited preview listings; full +parcel details, historical time-series and ranked city aggregates are gated +behind login/subscription or the commercial API at https://api.cih-index.com/. +No stable public JSON/CSV endpoint exists, so this parser is implemented as a +TODO stub that returns an empty observation list. +""" + +import logging + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "property_zhongzhi_land", + "name_zh": "中指云土地招拍挂数据", + "name_en": "CREIS / Zhongzhi China Land Auction and Transaction Data", + "url": "https://www.cih-index.com/landlist/land/", + "access_method": "todo", + "frequency": "daily", + "sector": "property", + "difficulty": "hard", + "note": ( + "Daily land auction, supply-plan and transaction listings published by " + "China Index Academy (CREIS / 中指云 / 中指研究院). The public portal shows " + "search/filter pages and limited preview records, but full parcel details, " + "historical time-series and ranked city aggregates are gated behind " + "login/subscription or delivered through the commercial API at " + "https://api.cih-index.com/. No stable open JSON/CSV endpoint is available, " + "so this source is a TODO stub." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return an empty list; data is behind a commercial/paywalled API.""" + logger.warning( + "[property_zhongzhi_land] TODO: CREIS/Zhongzhi land data requires " + "authenticated commercial API or subscription; public preview pages do " + "not expose a stable machine-readable time-series. Skipping collection." + ) + return [] diff --git a/collectors/cn_hf/parsers/scfi.py b/collectors/cn_hf/parsers/scfi.py new file mode 100644 index 0000000..6b6116c --- /dev/null +++ b/collectors/cn_hf/parsers/scfi.py @@ -0,0 +1,107 @@ +"""SCFI — Shanghai Containerized Freight Index. + +Source: Shanghai Shipping Exchange public current-index endpoint. +https://en.sse.net.cn/currentIndex?indexName=scfi + +Returns the latest weekly composite index and any route sub-indices that +have a non-null current value. +""" + +import logging +from datetime import datetime +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "scfi", + "name_zh": "上海出口集装箱运价指数", + "name_en": "Shanghai Containerized Freight Index (SCFI)", + "url": "https://en.sse.net.cn/currentIndex?indexName=scfi", + "access_method": "open_json", + "frequency": "weekly", + "sector": "transport_logistics", + "difficulty": "easy", + "note": ( + "Published weekly (Fridays) by the Shanghai Shipping Exchange. " + "The public /currentIndex endpoint returns the latest SCFI composite index " + "and per-route sub-indices as JSON without authentication." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Fetch the latest SCFI release and return observation rows.""" + url = src.get("url", SOURCE["url"]) + observations: list[dict] = [] + + try: + resp = await http.get(url) + if resp.status_code != 200: + logger.warning( + f"[scfi] Unexpected status {resp.status_code} from {url}" + ) + return [] + + payload = resp.json() + except Exception as e: # pragma: no cover - network/parse failure path + logger.warning(f"[scfi] Failed to fetch or parse JSON: {e}") + return [] + + data = payload.get("data") if isinstance(payload, dict) else None + if not isinstance(data, dict): + logger.warning("[scfi] Response missing 'data' object") + return [] + + current_date_raw = data.get("currentDate") + last_date_raw = data.get("lastDate") + line_data = data.get("lineDataList") + if not isinstance(line_data, list): + logger.warning("[scfi] Response missing 'lineDataList'") + return [] + + try: + current_date = datetime.fromisoformat(str(current_date_raw)).date().isoformat() + except Exception: + current_date = str(current_date_raw) if current_date_raw is not None else None + + for item in line_data: + if not isinstance(item, dict): + continue + + value = item.get("currentContent") + if value is None: + continue + + try: + value = float(value) + except (TypeError, ValueError): + logger.warning(f"[scfi] Non-numeric value skipped: {value}") + continue + + props = item.get("properties") or {} + observations.append( + { + "date": current_date, + "value": value, + "indicator": SOURCE["key"], + "metadata": { + "line_name_zh": props.get("lineName_ZH", ""), + "line_name_en": props.get("lineName_EN", ""), + "data_item_type": item.get("dataItemTypeName", ""), + "unit_zh": props.get("unit_ZH", ""), + "unit_en": props.get("unit_EN", ""), + "weighting_zh": props.get("weighting_ZH", ""), + "weighting_en": props.get("weighting_EN", ""), + "last_date": last_date_raw, + "last_value": item.get("lastContent"), + "absolute_change": item.get("absolute"), + "percentage_change": item.get("percentage"), + }, + } + ) + + logger.info(f"[scfi] Collected {len(observations)} observations for {current_date}") + return observations diff --git a/collectors/cn_hf/parsers/steel_100njz_construction.py b/collectors/cn_hf/parsers/steel_100njz_construction.py new file mode 100644 index 0000000..eb28983 --- /dev/null +++ b/collectors/cn_hf/parsers/steel_100njz_construction.py @@ -0,0 +1,48 @@ +"""100njz / Mysteel construction steel price parser. + +百年建筑网建筑钢材行情数据由上海钢联(Mysteel)建筑钢材频道提供。 +The public listing pages currently expose only "电议" (contact-for-price) +quotes and do not publish numeric daily price values without a commercial +Mysteel/100njz API agreement. Therefore this parser is registered as a TODO +stub and returns an empty observation list. +""" + +import logging + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict = { + "key": "steel_100njz_construction", + "name_zh": "百年建筑网建筑钢材价格", + "name_en": "100njz Construction Steel Price", + "url": "https://jiancai.mysteel.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "steel", + "difficulty": "hard", + "note": ( + "百年建筑网(100njz.com)建筑钢材行情由上海钢联(Mysteel)建筑钢材频道" + "(jiancai.mysteel.com)提供,日度更新。公开页面仅展示'电议'报价," + "具体日度价格数据需登录或商业数据接口。TODO:接入Mysteel/100njz商业API或" + "确认可公开访问的价格指数页面。" + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return an empty observation list. + + Numeric construction steel prices from 100njz/Mysteel are not available on + the public listing page without a commercial agreement. The public page only + shows '电议' (contact-for-price) listings, so no observable `value` can be + extracted. + """ + logger.warning( + "[%s] TODO: numeric construction steel prices require a commercial " + "Mysteel/100njz API or authenticated data feed; public page shows only " + "contact-for-price listings.", + src.get("key", SOURCE["key"]), + ) + return [] diff --git a/collectors/cn_hf/parsers/steel_mysteel.py b/collectors/cn_hf/parsers/steel_mysteel.py new file mode 100644 index 0000000..6d4e224 --- /dev/null +++ b/collectors/cn_hf/parsers/steel_mysteel.py @@ -0,0 +1,43 @@ +"""CN-HF parser stub: Mysteel China steel prices (steel_mysteel). + +Mysteel (我的钢铁网) publishes daily spot steel and raw-materials prices for +China, but the full historical time-series and stable machine-readable feeds +are behind a subscription wall and anti-bot protection. The public news pages +show the latest prices, yet there is no reliable, unauthenticated JSON or CSV +endpoint. This parser is therefore marked as "todo" and returns an empty +result while logging the reason. +""" + +import logging +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +SOURCE: dict[str, Any] = { + "key": "steel_mysteel", + "name_zh": "我的钢铁网钢材价格", + "name_en": "Mysteel China Steel Prices", + "url": "https://news.mysteel.com/", + "access_method": "todo", + "frequency": "daily", + "sector": "steel", + "difficulty": "hard", + "note": ( + "Daily spot steel and raw-materials prices for China published by Mysteel. " + "No stable open JSON/API endpoint is available; the full time-series is " + "behind a subscription/anti-bot wall. TODO: implement a scraper or " + "commercial API integration." + ), +} + + +async def collect(http: httpx.AsyncClient, src: dict) -> list[dict]: + """Return no observations; this source requires a scraper or authenticated feed.""" + logger.warning( + "[%s] Mysteel steel price source is a TODO stub: no stable public " + "JSON/CSV endpoint available (subscription/anti-bot required).", + SOURCE["key"], + ) + return [] diff --git a/collectors/cn_indicators.py b/collectors/cn_indicators.py index 7ffbe99..270e9eb 100644 --- a/collectors/cn_indicators.py +++ b/collectors/cn_indicators.py @@ -13,6 +13,7 @@ back to a small built-in set of open World Bank China proxies. """ +import importlib import io import json import logging @@ -25,12 +26,12 @@ from core.base_collector import BaseCollector - # ── Per-source bespoke parsers ─────────────────────────────────────── # Some open sources return idiosyncratic JSON the generic json_path/date/value # mapping can't reach. Each parser takes the decoded response and returns a flat # list of {"date": str, "value": number, **extra} observations. + def _parse_sse_freight(data: Any) -> list: """Shanghai Shipping Exchange CCFI/SCFI composite index. @@ -48,7 +49,7 @@ def emit(item, label): return out for item in lines: - dit = (item.get("dataItemTypeName") or "") + dit = item.get("dataItemTypeName") or "" en = ((item.get("properties") or {}).get("lineName_EN") or "").strip().upper() if dit.endswith("_T") or en == "COMPOSITE INDEX": return emit(item, "COMPOSITE") @@ -64,11 +65,17 @@ def _parse_chinadata_series(data: Any, value_key: str = "export") -> list: for r in rows: if not isinstance(r, dict): continue - out.append({ - "date": r.get("date"), - "value": r.get(value_key), - **{k: r.get(k) for k in ("total", "export", "import", "balance") if k in r}, - }) + out.append( + { + "date": r.get("date"), + "value": r.get(value_key), + **{ + k: r.get(k) + for k in ("total", "export", "import", "balance") + if k in r + }, + } + ) return out @@ -77,6 +84,52 @@ def _parse_chinadata_series(data: Any, value_key: str = "export") -> list: "scfi": _parse_sse_freight, "macro_customs": _parse_chinadata_series, } + + +# ── CN-HF parser-module registry ───────────────────────────────────── +# Each module under collectors/cn_hf/parsers/ that exposes a SOURCE dict with +# a ``key`` and an async ``collect(http, src)`` function is registered here. +# Sources whose ``parser`` field is set to "cn_hf" are routed through this +# registry instead of the generic JSON/CSV path. + + +def _load_cn_hf_parser_registry() -> dict[str, Any]: + """Discover and import parser modules in collectors/cn_hf/parsers/.""" + registry: dict[str, Any] = {} + parsers_dir = Path(__file__).resolve().parent / "cn_hf" / "parsers" + if not parsers_dir.exists(): + return registry + + for path in sorted(parsers_dir.glob("*.py")): + if path.name.startswith("_"): + continue + module_name = f"collectors.cn_hf.parsers.{path.stem}" + try: + module = importlib.import_module(module_name) + except Exception as e: + logging.warning( + "[CNIndicators] Failed to import CN-HF parser %s: %s", + module_name, + e, + ) + continue + + source = getattr(module, "SOURCE", None) + collect_fn = getattr(module, "collect", None) + if ( + source + and isinstance(source, dict) + and source.get("key") + and collect_fn + and callable(collect_fn) + ): + registry[source["key"]] = module + + return registry + + +_CN_HF_PARSER_REGISTRY = _load_cn_hf_parser_registry() + from core.exceptions import SchemaChangedError logger = logging.getLogger(__name__) @@ -249,7 +302,11 @@ def _load_sources(self) -> list[dict]: if raw_sources and isinstance(raw_sources[0], str): catalog = self._load_catalog() key_set = set(raw_sources) - return [self._normalize_source(s) for s in catalog if s.get("key") in key_set] + return [ + self._normalize_source(s) + for s in catalog + if s.get("key") in key_set + ] return [] return [] @@ -265,11 +322,15 @@ def _load_catalog() -> list[dict]: if _CATALOG_PATH.exists(): data = json.loads(_CATALOG_PATH.read_text(encoding="utf-8")) if isinstance(data, dict): - return data.get("sources", []) or data.get("enabled_sources", []) or [] + return ( + data.get("sources", []) or data.get("enabled_sources", []) or [] + ) if isinstance(data, list): return data except Exception as e: - logger.warning(f"[CNIndicators] Failed to load catalog {_CATALOG_PATH}: {e}") + logger.warning( + f"[CNIndicators] Failed to load catalog {_CATALOG_PATH}: {e}" + ) return [] @staticmethod @@ -338,22 +399,25 @@ async def collect(self) -> list[dict]: if date is None or value is None: continue - records.append({ - "key": key, - "date": date, - "value": value, - "unit": src.get("unit", ""), - "sector": src.get("sector", "macro"), - "frequency": src.get("frequency", "unknown"), - "source_name_zh": src.get("name_zh", key), - "source_name_en": src.get("name_en", key), - "url": url, - "access": access, - "metadata_extra": { - k: v for k, v in item.items() - if k not in (date_field, value_field) - }, - }) + records.append( + { + "key": key, + "date": date, + "value": value, + "unit": src.get("unit", ""), + "sector": src.get("sector", "macro"), + "frequency": src.get("frequency", "unknown"), + "source_name_zh": src.get("name_zh", key), + "source_name_en": src.get("name_en", key), + "url": url, + "access": access, + "metadata_extra": { + k: v + for k, v in item.items() + if k not in (date_field, value_field) + }, + } + ) count += 1 logger.info(f"[CNIndicators] {key}: collected {count} records") @@ -363,6 +427,28 @@ async def collect(self) -> list[dict]: async def _fetch_source(self, src: dict) -> Any: """Fetch one source and return the list of observations.""" + parser = src.get("parser", "json") + + # Route to a CN-HF parser module if requested. Parser modules handle their + # own HTTP/HTML logic and return observations directly. + if parser == "cn_hf": + module = _CN_HF_PARSER_REGISTRY.get(src["key"]) + if module is None: + logger.warning( + "[CNIndicators] %s: parser 'cn_hf' requested but no module registered", + src["key"], + ) + return [] + try: + return await module.collect(self._http, src) + except Exception as e: + logger.warning( + "[CNIndicators] %s: cn_hf parser failed: %s", + src["key"], + e, + ) + return [] + url = src["url"] method = src.get("method", "GET").upper() @@ -379,7 +465,6 @@ async def _fetch_source(self, src: dict) -> Any: ) return [] - parser = src.get("parser", "json") if parser == "csv": df = pd.read_csv(io.StringIO(resp.text)) return df.to_dict("records") @@ -478,22 +563,24 @@ async def parse(self, raw_data: list[dict]) -> pd.DataFrame: """Transform raw records into the EconomicData schema.""" rows = [] for r in raw_data: - rows.append({ - "indicator": r["key"], - "date": r["date"], - "value": r["value"], - "unit": r["unit"], - "metadata": { - "category": r["sector"], - "frequency": r["frequency"], - "source_name_zh": r["source_name_zh"], - "source_name_en": r["source_name_en"], - "url": r["url"], - "access": r["access"], - "sector": r["sector"], - "raw": r.get("metadata_extra", {}), - }, - }) + rows.append( + { + "indicator": r["key"], + "date": r["date"], + "value": r["value"], + "unit": r["unit"], + "metadata": { + "category": r["sector"], + "frequency": r["frequency"], + "source_name_zh": r["source_name_zh"], + "source_name_en": r["source_name_en"], + "url": r["url"], + "access": r["access"], + "sector": r["sector"], + "raw": r.get("metadata_extra", {}), + }, + } + ) return pd.DataFrame(rows) def validate(self, df: pd.DataFrame) -> bool: diff --git a/collectors/elite_hk_property.py b/collectors/elite_hk_property.py new file mode 100644 index 0000000..e0ba1bf --- /dev/null +++ b/collectors/elite_hk_property.py @@ -0,0 +1,199 @@ +"""Hong Kong private residential property price index collector. + +Source: Hong Kong Rating and Valuation Department (RVD) + "Private Domestic - Price Indices by Class (Territory-wide)" + https://www.rvd.gov.hk/en/publications/property_market_statistics.html + +The headline "All Classes" index is published monthly with base 1999=100. +Public XLS download; no authentication required. +""" + +import io +import logging +from datetime import datetime, timezone +from typing import Any + +import httpx +import pandas as pd + +from core.base_collector import BaseCollector + +logger = logging.getLogger(__name__) + +RVD_PROPERTY_PRICE_XLS = ( + "https://www.rvd.gov.hk/doc/en/statistics/his_data_4.xls" +) + +# Column layout in the "Monthly" sheet of his_data_4.xls +# (pandas reads merged cells into alternating columns) +MONTHLY_COLUMNS = { + 8: ("hk_property_price_index_class_a", "Class A (<40 m²)"), + 11: ("hk_property_price_index_class_b", "Class B (40-69.9 m²)"), + 14: ("hk_property_price_index_class_c", "Class C (70-99.9 m²)"), + 17: ("hk_property_price_index_class_d", "Class D (100-159.9 m²)"), + 20: ("hk_property_price_index_class_e", "Class E (≥160 m²)"), + 23: ("hk_property_price_index_under_100m2", "A, B & C (<100 m²)"), + 26: ("hk_property_price_index_100m2_plus", "D & E (≥100 m²)"), + 29: ("hk_property_price_index", "All Classes"), +} + +UNIT = "index (1999=100)" + + +class HKPropertyCollector(BaseCollector): + """Collect Hong Kong RVD private domestic price indices.""" + + name = "hk_property" + source_type = "api" + + async def collect(self) -> list[dict]: + """Fetch the RVD monthly price-index XLS and return raw observations.""" + try: + resp = await self._http.get(RVD_PROPERTY_PRICE_XLS) + if resp.status_code != 200: + logger.warning( + f"[{self.name}] RVD returned HTTP {resp.status_code}" + ) + return [] + + return self._parse_xls(resp.content) + except httpx.HTTPError as e: + logger.warning(f"[{self.name}] Network error: {e}") + except Exception as e: + logger.warning(f"[{self.name}] Failed to collect data: {e}") + return [] + + def _parse_xls(self, content: bytes) -> list[dict]: + """Parse the monthly sheet from the downloaded XLS bytes.""" + try: + xls = pd.ExcelFile(io.BytesIO(content)) + except Exception as e: + logger.warning( + f"[{self.name}] Cannot open XLS (missing xlrd/openpyxl?): {e}" + ) + return [] + + # Pick the monthly sheet; fall back to the first sheet if not found. + sheet_name = next( + (s for s in xls.sheet_names if "monthly" in s.lower()), + xls.sheet_names[0] if xls.sheet_names else None, + ) + if sheet_name is None: + logger.warning(f"[{self.name}] No sheets found in XLS") + return [] + + df = xls.parse(sheet_name, header=None) + + # Data rows start around row 10 (0-indexed). Keep all rows and filter later. + data_rows = df.iloc[10:].copy() + if data_rows.empty: + logger.warning(f"[{self.name}] No data rows in monthly sheet") + return [] + + # Forward-fill the year from column 1 so each month has a year value. + data_rows[1] = data_rows[1].ffill() + + records: list[dict[str, Any]] = [] + for _, row in data_rows.iterrows(): + year = row.get(1) + month = row.get(5) + + if not self._is_valid_year_month(year, month): + continue + + year = int(year) + month = int(month) + dt = datetime(year, month, 1, tzinfo=timezone.utc) + + for col, (indicator, category) in MONTHLY_COLUMNS.items(): + raw_value = row.get(col) + value = self._to_float(raw_value) + if value is None: + continue + + records.append({ + "date": dt, + "year": year, + "month": month, + "indicator": indicator, + "value": value, + "unit": UNIT, + "category": category, + }) + + logger.info( + f"[{self.name}] Parsed {len(records)} observations from RVD XLS" + ) + return records + + @staticmethod + def _is_valid_year_month(year: Any, month: Any) -> bool: + """Check that year and month are usable integers.""" + try: + y = int(year) + m = int(month) + except (TypeError, ValueError): + return False + return 1979 <= y <= 2100 and 1 <= m <= 12 + + @staticmethod + def _to_float(value: Any) -> float | None: + """Convert a cell value to float, ignoring notes/dashes/blank cells.""" + if value is None: + return None + if isinstance(value, (int, float)): + if pd.isna(value): + return None + return float(value) + text = str(value).strip() + if not text or text in {"-", "(", ")", "*", "", "nan"}: + return None + # Some cells contain note markers like "( " — strip them. + cleaned = text.replace("(", "").replace(")", "").replace(",", "").strip() + try: + return float(cleaned) + except ValueError: + return None + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform raw observations into EconomicData-shaped rows.""" + if not raw_data: + return pd.DataFrame() + + rows = [] + for r in raw_data: + rows.append({ + "source": self.name, + "indicator": r.get("indicator", ""), + "date": r.get("date"), + "value": r.get("value"), + "unit": r.get("unit", ""), + "metadata": { + "category": r.get("category", ""), + "year": r.get("year"), + "month": r.get("month"), + }, + }) + + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + """Validate parsed DataFrame has required EconomicData columns.""" + required = ["source", "indicator", "date", "value", "unit"] + missing = [c for c in required if c not in df.columns] + if missing: + from core.exceptions import SchemaChangedError + raise SchemaChangedError(self.name, required, list(df.columns)) + if df.empty: + return True + if not pd.api.types.is_datetime64_any_dtype(df["date"]): + from core.exceptions import SchemaChangedError + raise SchemaChangedError( + self.name, ["datetime date column"], list(df.columns) + ) + if not pd.api.types.is_numeric_dtype(df["value"]): + from core.exceptions import SchemaChangedError + raise SchemaChangedError( + self.name, ["numeric value column"], list(df.columns) + ) + return True diff --git a/collectors/elite_peoples_daily_byline.py b/collectors/elite_peoples_daily_byline.py new file mode 100644 index 0000000..c62c855 --- /dev/null +++ b/collectors/elite_peoples_daily_byline.py @@ -0,0 +1,147 @@ +"""People's Daily byline-frequency collector (elite signal). + +Attempts to extract a daily article-count / byline-activity proxy from the +public People's Daily Online print-edition archive. The site markup changes +periodically and may rate-limit foreign requests, so the collector degrades +gracefully to an empty result set and logs a warning rather than raising. + +Output EconomicData rows: + indicator = "peoples_daily_byline" + value = number of articles detected on the page + unit = "articles" +""" + +import logging +from datetime import datetime, timedelta, timezone + +import pandas as pd +from bs4 import BeautifulSoup + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + + +class PeoplesDailyCollector(BaseCollector): + name = "peoples_daily_byline" + source_type = "api" + + # Public print-edition archive URLs. The first working pattern wins. + ARCHIVE_URL_PATTERNS = [ + "http://paper.people.com.cn/rmrb/html/{date}/nbs.D110000renmrb_01.htm", + "http://paper.people.com.cn/rmrb/html/{date}/node_1.htm", + ] + + def __init__(self, config: dict): + super().__init__(config) + self.lookback_days = int(config.get("lookback_days", 7)) + self.timeout = int(config.get("timeout", 30)) + + async def collect(self) -> list[dict]: + """Fetch recent People's Daily archive pages. + + Returns a list of raw records, one per archive page, each containing + the HTML, date and resolved URL. Network or parse failures are + swallowed and result in an empty list. + """ + records: list[dict] = [] + today = datetime.now(timezone.utc) + + for day_offset in range(self.lookback_days): + issue_date = today - timedelta(days=day_offset) + date_path = issue_date.strftime("%Y-%m/%d") + iso_date = issue_date.strftime("%Y-%m-%d") + + fetched = False + for pattern in self.ARCHIVE_URL_PATTERNS: + url = pattern.format(date=date_path) + try: + resp = await self._http.get(url, timeout=self.timeout) + if resp.status_code == 200: + records.append({ + "date": iso_date, + "html": resp.text, + "url": str(resp.url), + }) + fetched = True + break + logger.debug( + f"[{self.name}] {url} returned HTTP {resp.status_code}" + ) + except Exception as e: # noqa: BLE001 + logger.debug(f"[{self.name}] Failed to fetch {url}: {e}") + + if not fetched: + logger.warning( + f"[{self.name}] Could not retrieve archive page for {iso_date}" + ) + + if not records: + logger.warning( + f"[{self.name}] No archive pages retrieved; returning empty result" + ) + else: + logger.info(f"[{self.name}] Collected {len(records)} archive pages") + return records + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Extract a daily article-count proxy from archive HTML. + + The print-edition archive lists article headlines/teasers. We count + links that look like article links, falling back to headline/list-item + counts when no obvious article links are found. + """ + rows: list[dict] = [] + for page in raw_data: + html = page.get("html", "") + if not html: + continue + + try: + soup = BeautifulSoup(html, "html.parser") + + article_links = [ + a for a in soup.find_all("a", href=True) + if any( + token in a["href"].lower() + for token in ("content", "node", "article", "n.") + ) + ] + count = len(article_links) + + if count == 0: + # Fallback: count headline and list-item elements. + count = len(soup.find_all(["h1", "h2", "h3", "h4", "li"])) + + if count == 0: + logger.warning( + f"[{self.name}] No content found for {page.get('date')}" + ) + continue + + parsed_date = datetime.strptime( + page["date"], "%Y-%m-%d" + ).replace(tzinfo=timezone.utc) + + rows.append({ + "indicator": "peoples_daily_byline", + "date": parsed_date, + "value": float(count), + "unit": "articles", + "metadata": {"source_url": page.get("url", "")}, + }) + except Exception as e: # noqa: BLE001 + logger.warning( + f"[{self.name}] Parse error for {page.get('date')}: {e}" + ) + + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + """Ensure the parsed DataFrame contains required EconomicData columns.""" + required = ["indicator", "date", "value"] + missing = [col for col in required if col not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/elite_politburo_readouts.py b/collectors/elite_politburo_readouts.py new file mode 100644 index 0000000..784c28d --- /dev/null +++ b/collectors/elite_politburo_readouts.py @@ -0,0 +1,189 @@ +"""Public Politburo meeting readout metadata collector. + +Best-effort scraper that extracts Politburo meeting metadata from the +references section of the English Wikipedia article on the 20th Politburo +of the Chinese Communist Party. Wikipedia citations link to official +readouts (e.g. Xinhua, gov.cn, Communist Party Membership Network) and +provide the meeting date and title in a stable, public, neutral format. + +Returns one EconomicData-shaped row per identified meeting / collective +study session, with value=1 and metadata containing the original title, +URL and publisher. +""" + +import logging +import re +from datetime import datetime, timezone + +import pandas as pd +from bs4 import BeautifulSoup +from dateutil import parser as date_parser + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + +DEFAULT_URL = ( + "https://en.wikipedia.org/wiki/20th_Politburo_of_the_Chinese_Communist_Party" +) + +# Reference must mention the Politburo / Political Bureau in Chinese or English. +_KEYWORDS = ( + "中共中央政治局", + "political bureau of the cpc central committee", + "politburo of the cpc central committee", +) + +# Regexes used to pull a publication date out of a Wikipedia citation line. +_DATE_PATTERNS = [ + re.compile(r"\((\d{1,2}\s+[A-Za-z]+\s+\d{4})\)"), # (25 October 2022) + re.compile(r"\(([A-Za-z]+\s+\d{1,2},?\s+\d{4})\)"), # (October 25, 2022) + re.compile(r"\((\d{4}-\d{2}-\d{2})\)"), # (2022-10-25) + re.compile(r"(\d{1,2}\s+[A-Za-z]+\s+\d{4})"), # 25 October 2022 + re.compile(r"([A-Za-z]+\s+\d{1,2},?\s+\d{4})"), # October 25, 2022 +] + + +class PolitburoReadoutsCollector(BaseCollector): + name = "politburo_readouts" + source_type = "api" # routed to the EconomicData table by BaseCollector + + def __init__(self, config: dict): + super().__init__(config) + self.url = config.get("url", DEFAULT_URL) + # Wikipedia blocks non-browser user-agents; use a generic browser string. + self._http.headers["User-Agent"] = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ) + + async def collect(self) -> list[dict]: + try: + resp = await self._http.get(self.url) + if resp.status_code != 200: + logger.warning( + f"[{self.name}] {self.url} returned HTTP {resp.status_code}" + ) + return [] + return [{"html": resp.text, "url": self.url}] + except Exception as e: + logger.warning(f"[{self.name}] Failed to fetch {self.url}: {e}") + return [] + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + if not raw_data: + return pd.DataFrame() + + html = raw_data[0].get("html", "") + source_url = raw_data[0].get("url", self.url) + + try: + soup = BeautifulSoup(html, "html.parser") + except Exception as e: + logger.warning(f"[{self.name}] HTML parse failed: {e}") + return pd.DataFrame() + + rows = [] + references = soup.find("ol", {"class": "references"}) + if not references: + logger.warning(f"[{self.name}] No references list found at {source_url}") + return pd.DataFrame() + + for li in references.find_all("li", recursive=False): + cite = li.find("cite") + if not cite: + continue + + text = cite.get_text(" ", strip=True).lower() + if not any(kw.lower() in text for kw in _KEYWORDS): + continue + + link = cite.find("a", {"class": "external text"}) + readout_url = link.get("href", "") if link else "" + title = (link.get_text(strip=True) if link else "").strip('"') + + # Try to grab the English translation in square brackets when present. + translation = "" + bracket_match = re.search(r"\[([^\]]+)\]", cite.get_text(" ", strip=True)) + if bracket_match: + translation = bracket_match.group(1).strip() + + meeting_date = self._extract_date(cite.get_text(" ", strip=True)) + if meeting_date is None: + logger.debug( + f"[{self.name}] Could not extract date for reference: {title!r}" + ) + continue + + publisher = self._extract_publisher(cite) + + rows.append( + { + "indicator": self.name, + "date": meeting_date, + "value": 1, + "unit": "meeting", + "metadata": { + "source_url": source_url, + "readout_url": readout_url, + "title": title or translation, + "title_zh": title if any( + "\u4e00" <= ch <= "\u9fff" for ch in title + ) else "", + "translation": translation, + "publisher": publisher, + }, + } + ) + + if not rows: + logger.warning( + f"[{self.name}] No Politburo meeting references parsed from {source_url}" + ) + + df = pd.DataFrame(rows) + if not df.empty: + df = df.sort_values("date").reset_index(drop=True) + return df + + def _extract_date(self, text: str) -> datetime | None: + """Return a timezone-aware datetime parsed from a citation line.""" + for pattern in _DATE_PATTERNS: + match = pattern.search(text) + if not match: + continue + date_str = match.group(1) + try: + dt = date_parser.parse(date_str) + return dt.replace(tzinfo=timezone.utc) + except Exception: + continue + return None + + def _extract_publisher(self, cite) -> str: + """Best-effort publisher extraction from a Wikipedia node.""" + text = cite.get_text(" ", strip=True) + # Common Wikipedia cite-web pattern: "...(in Language). Publisher. Retrieved ..." + match = re.search(r"\(in\s+[^)]+\)\.\s*([^\.]+?)\.", text) + if match: + return match.group(1).strip() + # Fallback: last internal wiki link text before access date. + for a in cite.find_all("a"): + href = a.get("href", "") + if href.startswith("/wiki/") and "accessdate" not in href.lower(): + continue + if not href.startswith("http"): + continue + return a.get_text(strip=True) + return "" + + def validate(self, df: pd.DataFrame) -> bool: + if df.empty: + return True + required = ["indicator", "date", "value", "unit"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/elite_safe_net_errors.py b/collectors/elite_safe_net_errors.py new file mode 100644 index 0000000..8d31aab --- /dev/null +++ b/collectors/elite_safe_net_errors.py @@ -0,0 +1,113 @@ +"""SAFENetErrorsCollector — SAFE net errors & omissions proxy. + +Fetches China's balance-of-payments net errors and omissions from the World Bank +WDI open API (indicator BN.KAC.EOMS.CD). This is a public, neutral proxy for +the SAFE balance-of-payments release series. +""" + +import logging +from datetime import datetime, timezone + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + + +class SAFENetErrorsCollector(BaseCollector): + name = "safe_net_errors" + source_type = "api" + + # World Bank WDI: Net errors and omissions (BoP, current US$) + BASE_URL = "https://api.worldbank.org/v2" + INDICATOR = "BN.KAC.EOMS.CD" + COUNTRY = "CHN" + + def __init__(self, config: dict): + super().__init__(config) + self.indicator = config.get("indicator", self.INDICATOR) + self.country = config.get("country", self.COUNTRY) + self.start_year = config.get("start_year", 2015) + self.end_year = config.get("end_year", 2026) + + async def collect(self) -> list[dict]: + records = [] + try: + url = f"{self.BASE_URL}/country/{self.country}/indicator/{self.indicator}" + resp = await self._http.get( + url, + params={ + "format": "json", + "per_page": 100, + "date": f"{self.start_year}:{self.end_year}", + }, + ) + if resp.status_code != 200: + logger.warning( + f"[{self.name}] World Bank returned HTTP {resp.status_code}" + ) + return records + + data = resp.json() + if not isinstance(data, list) or len(data) < 2: + logger.warning( + f"[{self.name}] Unexpected World Bank response shape" + ) + return records + + for item in data[1] or []: + value = item.get("value") + if value is None: + continue + records.append({ + "indicator": self.indicator, + "country": self.country, + "date": item.get("date", ""), + "value": float(value), + "indicator_name": item.get("indicator", {}).get("value", ""), + "country_name": item.get("country", {}).get("value", ""), + }) + except Exception as e: + logger.warning(f"[{self.name}] Collection failed: {e}") + + logger.info(f"[{self.name}] Collected {len(records)} data points") + return records + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + rows = [] + for r in raw_data: + date_str = str(r.get("date", "")).strip() + try: + if date_str and date_str.isdigit(): + date = datetime(int(date_str), 1, 1, tzinfo=timezone.utc) + elif date_str: + date = datetime.fromisoformat( + date_str.replace("Z", "+00:00") + ) + else: + date = datetime.now(timezone.utc) + except Exception: + date = datetime.now(timezone.utc) + + rows.append({ + "indicator": "safe_net_errors", + "date": date, + "value": r.get("value"), + "unit": "USD", + "metadata": { + "country": r.get("country", self.country), + "indicator_name": r.get("indicator_name", ""), + "country_name": r.get("country_name", ""), + "world_bank_indicator": r.get("indicator", self.indicator), + }, + }) + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + required = ["indicator", "date", "value"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/physical_ais_shipping.py b/collectors/physical_ais_shipping.py new file mode 100644 index 0000000..63fc796 --- /dev/null +++ b/collectors/physical_ais_shipping.py @@ -0,0 +1,86 @@ +"""AIS shipping / port-call traffic collector for China. + +High-frequency AIS vessel traffic data for Chinese ports is only available +through authenticated commercial APIs (MarineTraffic, VesselFinder, +UN Global Platform AIS, Spire, etc.) or private satellite/terrestrial +receiver networks. There is no stable public, unauthenticated endpoint that +covers China. + +This collector therefore degrades gracefully: it logs a TODO warning and +returns an empty record list. The parser/validate machinery is wired so the +file still satisfies the EconomicData schema and can be extended later if a +public endpoint or an API key is provided. +""" + +import logging +from datetime import datetime, timezone + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + + +class AISShippingCollector(BaseCollector): + """China AIS shipping proxy. + + Public, unauthenticated high-frequency AIS data for China is not + available. The collector returns an empty result and logs a TODO note. + """ + + name = "ais_shipping" + source_type = "api" + + INDICATOR = "ais_portcalls_china" + UNIT = "calls" + + async def collect(self) -> list[dict]: + """Fetch raw AIS records. + + Because stable public endpoints do not exist, this is currently a + no-op that returns an empty list. + """ + logger.warning( + "[ais_shipping] Public unauthenticated AIS endpoint for China is " + "not available (MarineTraffic, VesselFinder, UN Global Platform, " + "and similar services require authenticated/commercial API keys). " + "Skipping collection." + ) + return [] + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform raw AIS records into the EconomicData schema.""" + rows = [] + for r in raw_data: + try: + date = r.get("date") + if isinstance(date, str): + date = datetime.fromisoformat(date.replace("Z", "+00:00")) + if date is None: + date = datetime.now(timezone.utc) + + value = float(r["value"]) + rows.append({ + "indicator": r.get("indicator", self.INDICATOR), + "date": date, + "value": value, + "unit": r.get("unit", self.UNIT), + "metadata": r.get("metadata", {}), + }) + except (KeyError, ValueError, TypeError) as e: + logger.warning(f"[ais_shipping] Parse error for record {r!r}: {e}") + + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + """Validate that parsed rows contain the required columns.""" + if df.empty: + return True + + required = ["indicator", "date", "value"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/physical_electricity_proxy.py b/collectors/physical_electricity_proxy.py new file mode 100644 index 0000000..bb417cc --- /dev/null +++ b/collectors/physical_electricity_proxy.py @@ -0,0 +1,133 @@ +"""Electricity proxy collector for China. + +Fetches annual electricity-generation data for China from the public +Our World in Data / Ember energy dataset. The series is a stable, +credentials-free proxy for total electricity output in terawatt-hours. + +Output EconomicData rows: + indicator = "electricity_proxy_china" + value = annual electricity generation (TWh) + unit = "TWh" +""" + +import io +import logging +from datetime import datetime, timezone + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + +# Public OWID/Ember energy dataset (CSV). +OWID_ENERGY_CSV = "https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv" + + +class ElectricityProxyCollector(BaseCollector): + """Collector for a public China electricity-generation proxy.""" + + name = "electricity_proxy" + source_type = "api" + + INDICATOR = "electricity_proxy_china" + UNIT = "TWh" + COUNTRY = "China" + VALUE_COLUMN = "electricity_generation" + + async def collect(self) -> list[dict]: + """Fetch the OWID energy CSV and return it as a raw record.""" + try: + resp = await self._http.get(OWID_ENERGY_CSV) + if resp.status_code != 200: + logger.warning( + f"[{self.name}] OWID returned HTTP {resp.status_code}" + ) + return [] + return [{"csv_text": resp.text}] + except Exception as e: + logger.warning(f"[{self.name}] Collection failed: {e}") + return [] + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform the OWID CSV into EconomicData-shaped rows for China.""" + if not raw_data: + return pd.DataFrame(columns=[ + "indicator", "date", "value", "unit", "metadata" + ]) + + csv_text = raw_data[0].get("csv_text", "") + if not csv_text: + return pd.DataFrame(columns=[ + "indicator", "date", "value", "unit", "metadata" + ]) + + try: + df = pd.read_csv(io.StringIO(csv_text)) + except Exception as e: + logger.warning(f"[{self.name}] Failed to parse CSV: {e}") + return pd.DataFrame(columns=[ + "indicator", "date", "value", "unit", "metadata" + ]) + + if "country" not in df.columns or self.VALUE_COLUMN not in df.columns: + logger.warning( + f"[{self.name}] Required columns missing from OWID dataset" + ) + return pd.DataFrame(columns=[ + "indicator", "date", "value", "unit", "metadata" + ]) + + china = df[df["country"] == self.COUNTRY].copy() + china = china.dropna(subset=[self.VALUE_COLUMN, "year"]) + if china.empty: + logger.warning( + f"[{self.name}] No {self.COUNTRY} data found in OWID dataset" + ) + return pd.DataFrame(columns=[ + "indicator", "date", "value", "unit", "metadata" + ]) + + rows = [] + for _, row in china.iterrows(): + try: + year = int(row["year"]) + value = float(row[self.VALUE_COLUMN]) + date = datetime(year, 12, 31, tzinfo=timezone.utc) + rows.append({ + "indicator": self.INDICATOR, + "date": date, + "value": value, + "unit": self.UNIT, + "metadata": { + "country": self.COUNTRY, + "year": year, + "source_dataset": "owid-energy-data", + "source_url": OWID_ENERGY_CSV, + }, + }) + except (ValueError, TypeError) as e: + logger.warning( + f"[{self.name}] Skipping invalid row for year " + f"{row.get('year')}: {e}" + ) + + if not rows: + logger.warning(f"[{self.name}] No parseable rows for {self.COUNTRY}") + return pd.DataFrame(columns=[ + "indicator", "date", "value", "unit", "metadata" + ]) + + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + """Validate that parsed rows contain the required EconomicData columns.""" + if df.empty: + return True + + required = ["indicator", "date", "value"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/physical_sentinel2.py b/collectors/physical_sentinel2.py new file mode 100644 index 0000000..52fd002 --- /dev/null +++ b/collectors/physical_sentinel2.py @@ -0,0 +1,153 @@ +"""Sentinel-2 physical-anchor collector for China. + +Queries the public Copernicus Data Space Ecosystem (CDSE) OData catalogue for +Sentinel-2 scene counts intersecting a China bounding box, aggregated by month. +No authentication is required for catalogue search. + +If the catalogue is unavailable or the response schema changes, the collector +gracefully returns an empty result and logs a warning. +""" + +import logging +from datetime import datetime, timezone + +import httpx +import pandas as pd +from dateutil.relativedelta import relativedelta + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + +# Approximate bounding box for mainland China (lon, lat). +CHINA_BBOX_WKT = ( + "POLYGON ((73.5 18, 135 18, 135 53.5, 73.5 53.5, 73.5 18))" +) + +CDSE_CATALOG_URL = "https://catalogue.dataspace.copernicus.eu/odata/v1/Products" + + +class Sentinel2Collector(BaseCollector): + """Collect monthly Sentinel-2 scene-count proxy for China.""" + + name = "sentinel2" + source_type = "api" + + def __init__(self, config: dict): + super().__init__(config) + self.months_back = config.get("months_back", 3) + self.catalog_url = config.get("catalog_url", CDSE_CATALOG_URL) + self.indicator = "sentinel2_scene_count_china" + self.unit = "scenes" + + async def collect(self) -> list[dict]: + """Fetch monthly Sentinel-2 scene counts for China from CDSE. + + Returns a list of raw records. On any failure, logs a warning and + returns an empty list so the engine can continue with other sources. + """ + records: list[dict] = [] + now = datetime.now(timezone.utc) + current_month = now.replace(day=1, hour=0, minute=0, second=0, microsecond=0) + + for offset in range(self.months_back): + month_start = current_month - relativedelta(months=offset) + month_end = month_start + relativedelta(months=1) + + odata_filter = ( + f"Collection/Name eq 'SENTINEL-2' and " + f"OData.CSC.Intersects(area=geography'SRID=4326;{CHINA_BBOX_WKT}') and " + f"ContentDate/Start ge {month_start.isoformat()} and " + f"ContentDate/Start lt {month_end.isoformat()}" + ) + params = { + "$filter": odata_filter, + "$count": "true", + "$top": "0", + } + + try: + resp = await self._http.get(self.catalog_url, params=params) + resp.raise_for_status() + payload = resp.json() + count = payload.get("@odata.count") + if count is None: + logger.warning( + "[%s] Missing @odata.count for month %s", + self.name, + month_start.date().isoformat(), + ) + continue + + records.append( + { + "date": month_start, + "value": int(count), + "indicator": self.indicator, + "unit": self.unit, + "metadata": { + "bbox_wkt": CHINA_BBOX_WKT, + "collection": "SENTINEL-2", + "month": month_start.date().isoformat(), + "catalog_url": self.catalog_url, + "query_filter": odata_filter, + }, + } + ) + except httpx.HTTPError as e: + logger.warning( + "[%s] HTTP error fetching scene count for %s: %s", + self.name, + month_start.date().isoformat(), + e, + ) + except Exception as e: + logger.warning( + "[%s] Unexpected error fetching scene count for %s: %s", + self.name, + month_start.date().isoformat(), + e, + ) + + logger.info( + "[%s] Collected %d monthly Sentinel-2 scene-count records", + self.name, + len(records), + ) + return records + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform raw records into EconomicData-shaped rows.""" + if not raw_data: + return pd.DataFrame( + columns=["date", "value", "indicator", "unit", "metadata"] + ) + + rows = [] + for record in raw_data: + rows.append( + { + "date": record.get("date"), + "value": record.get("value"), + "indicator": record.get("indicator", self.indicator), + "unit": record.get("unit", self.unit), + "metadata": record.get("metadata", {}), + } + ) + + df = pd.DataFrame(rows) + if not df.empty and "date" in df.columns: + df["date"] = pd.to_datetime(df["date"], utc=True) + return df + + def validate(self, df: pd.DataFrame) -> bool: + """Validate the parsed DataFrame schema. + + Raises SchemaChangedError if required columns are missing. + """ + required = ["date", "value", "indicator", "unit"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/collectors/physical_viirs_nightlights.py b/collectors/physical_viirs_nightlights.py new file mode 100644 index 0000000..03a9134 --- /dev/null +++ b/collectors/physical_viirs_nightlights.py @@ -0,0 +1,79 @@ +"""VIIRS Nighttime Lights collector for China. + +This module implements a `BaseCollector` subclass for monthly VIIRS DNB +(Day/Night Band) nighttime lights over China. Aggregating the raw radiance +values into a single country-level time-series requires access to global +GeoTIFF/COG composites and a raster processing stack (rasterio/gdal), which is +not part of the project dependency set and is not exposed by a stable, +credentials-free JSON endpoint. + +Therefore the current implementation degrades gracefully: it logs a TODO and +returns an empty record set. When a public summary service or open COG-stat +endpoint becomes available, `collect()` can be wired to fetch it and `parse()` +can transform the response into EconomicData-shaped rows. +""" + +import logging + +import pandas as pd + +from core.base_collector import BaseCollector +from core.exceptions import SchemaChangedError + +logger = logging.getLogger(__name__) + + +class VIIRSNightlightsCollector(BaseCollector): + """Collector for VIIRS nighttime lights over China. + + Expected output indicator: ``viirs_nightlights_china`` + Unit: ``nW/cm2/sr`` + """ + + name = "viirs_nightlights" + source_type = "api" + + async def collect(self) -> list[dict]: + """Fetch raw VIIRS records. + + Currently a no-op stub because a stable public endpoint that exposes + pre-aggregated monthly radiance values for China is not available in + the allowed dependency set. + """ + logger.warning( + "[%s] TODO: no stable public endpoint for aggregated monthly VIIRS " + "nighttime lights over China (requires raster aggregation or " + "authenticated NASA/NOAA data). Returning empty.", + self.name, + ) + return [] + + async def parse(self, raw_data: list[dict]) -> pd.DataFrame: + """Transform raw VIIRS records into EconomicData-shaped rows.""" + if not raw_data: + return pd.DataFrame( + columns=["indicator", "date", "value", "unit", "metadata"] + ) + + rows = [] + for record in raw_data: + rows.append( + { + "indicator": record.get( + "indicator", "viirs_nightlights_china" + ), + "date": record.get("date"), + "value": record.get("value"), + "unit": record.get("unit", "nW/cm2/sr"), + "metadata": record.get("metadata", {}), + } + ) + return pd.DataFrame(rows) + + def validate(self, df: pd.DataFrame) -> bool: + """Validate that the parsed DataFrame has the expected columns.""" + required = ["indicator", "date", "value", "unit"] + missing = [c for c in required if c not in df.columns] + if missing: + raise SchemaChangedError(self.name, required, list(df.columns)) + return True diff --git a/config/cn_hf_sources.json b/config/cn_hf_sources.json index 50755f5..c31eddc 100644 --- a/config/cn_hf_sources.json +++ b/config/cn_hf_sources.json @@ -6,10 +6,11 @@ "name_en": "CPCA China Passenger Vehicle Retail Sales", "url": "https://www.cpcaauto.com/", "access_method": "todo", + "parser": "cn_hf", "frequency": "monthly", "sector": "autos", "difficulty": "hard", - "note": "Monthly passenger-vehicle retail sales for China published by the China Passenger Car Association (CPCA / 乘联会). The association releases the figures via website articles, PDF reports and its WeChat public account; there is no stable open JSON or CSV endpoint. Extracting a clean monthly time-series requires scraping or parsing the published reports. Marked todo until a scraper is implemented." + "note": "Monthly passenger-vehicle retail sales for China published by the China Passenger Car Association (CPCA / 乘联会). The association releases the figures via website articles, PDF reports and its WeChat public account; there is no stable open JSON or CSV endpoint. Extracting a clean monthly time-series requires scraping or parsing the published reports." }, { "key": "autos_cpca_wholesale", @@ -17,10 +18,11 @@ "name_en": "CPCA China Passenger Vehicle Wholesale Sales", "url": "https://www.cpcaauto.com/", "access_method": "todo", + "parser": "cn_hf", "frequency": "monthly", "sector": "autos", "difficulty": "hard", - "note": "Monthly passenger-vehicle wholesale sales for China published by the China Passenger Car Association (CPCA / 乘联会). The association releases the figures via website articles, PDF reports and its WeChat public account; there is no stable open JSON or CSV endpoint. Extracting a clean monthly time-series requires scraping or parsing the published reports. Marked todo until a scraper is implemented." + "note": "Monthly passenger-vehicle wholesale sales for China published by the China Passenger Car Association (CPCA / 乘联会). The association releases the figures via website articles, PDF reports and its WeChat public account; there is no stable open JSON or CSV endpoint. Extracting a clean monthly time-series requires scraping or parsing the published reports." }, { "key": "cement_digital", @@ -50,6 +52,7 @@ "name_en": "Thermal coal consumption", "url": "https://www.cctd.com.cn/", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "coal", "difficulty": "hard", @@ -58,13 +61,15 @@ { "key": "bdi", "name_zh": "波罗的海干散货指数", - "name_en": "Baltic Dry Index", - "url": "https://www.balticexchange.com/en/data-services/market-information0/dry-services.html", - "access_method": "todo", + "name_en": "Baltic Dry Index (BDI)", + "url": "https://www.investing.com/indices/baltic-dry-historical-data", + "access_method": "scrape", + "parser": "cn_hf", "frequency": "daily", "sector": "transport_logistics", - "difficulty": "hard", - "note": "Daily composite dry-bulk freight index published by the Baltic Exchange in London. The authoritative time-series is subscription-only; public aggregator pages (e.g. Trading Economics, investing.com, Macrotrends) expose current and historical values but require scraping and are subject to anti-bot/ToS limits. Marked todo until a stable open endpoint or scraper is implemented." + "difficulty": "medium", + "unit": "index", + "note": "Daily composite dry-bulk freight index published by the Baltic Exchange. The official feed is subscription-only; this parser scrapes the public Investing.com historical table as a best-effort open proxy. It is subject to anti-bot/ToS limits and may return partial history." }, { "key": "ccfi", @@ -72,21 +77,25 @@ "name_en": "China Containerized Freight Index (CCFI)", "url": "https://en.sse.net.cn/currentIndex?indexName=ccfi", "access_method": "open_json", + "parser": "cn_hf", "frequency": "weekly", "sector": "transport_logistics", "difficulty": "easy", - "note": "Published weekly (Fridays) by the Shanghai Shipping Exchange. The public endpoint returns the current CCFI composite index and route sub-indices as JSON without authentication. Historical single-date queries via /singleIndex/ccfi require login, but the latest/current release is openly available." + "unit": "index", + "note": "Published weekly (Fridays) by the Shanghai Shipping Exchange. The public /currentIndex endpoint returns the latest CCFI composite index and per-route sub-indices as JSON without authentication." }, { "key": "freight_road_logistics", "name_zh": "中国公路物流运价指数", "name_en": "China Road Logistics Price Index", - "url": "http://www.chinawuliu.com.cn/", - "access_method": "todo", + "url": "http://www.chinawuliu.com.cn/xsyj/tjsj/", + "access_method": "scrape", + "parser": "cn_hf", "frequency": "weekly", "sector": "transport_logistics", - "difficulty": "hard", - "note": "Published jointly by the China Federation of Logistics and Purchasing (CFLP) and Guangdong Lin'an Logistics Group. The weekly/monthly road-freight price index is released as HTML press releases on chinawuliu.com.cn and WeChat; no open JSON/CSV endpoint was found. Requires HTML scraping or manual extraction." + "difficulty": "medium", + "unit": "index", + "note": "Weekly road-freight price index published by CFLP and Lin'an Logistics. Scraped from public HTML report pages on chinawuliu.com.cn." }, { "key": "scfi", @@ -94,10 +103,12 @@ "name_en": "Shanghai Containerized Freight Index (SCFI)", "url": "https://en.sse.net.cn/currentIndex?indexName=scfi", "access_method": "open_json", + "parser": "cn_hf", "frequency": "weekly", "sector": "transport_logistics", "difficulty": "easy", - "note": "Published weekly (Fridays) by the Shanghai Shipping Exchange. The public /currentIndex endpoint returns the latest SCFI composite index and per-route sub-indices as JSON without authentication. Historical multi-date queries via /index/mutipleIndex require login, but the current release is openly available." + "unit": "index", + "note": "Published weekly (Fridays) by the Shanghai Shipping Exchange. The public /currentIndex endpoint returns the latest SCFI composite index and per-route sub-indices as JSON without authentication." }, { "key": "macro_caixin_pmi", @@ -127,6 +138,7 @@ "name_en": "China Monthly Import and Export Trade (GACC)", "url": "https://chinadata.live/api/v2/data/china-trade-monthly", "access_method": "open_json", + "parser": "cn_hf", "frequency": "monthly", "sector": "macro", "difficulty": "easy", @@ -138,10 +150,11 @@ "name_en": "NBS China Macroeconomic Data", "url": "https://data.stats.gov.cn/easyquery.htm?cn=A01", "access_method": "todo", + "parser": "cn_hf", "frequency": "monthly", "sector": "macro", "difficulty": "hard", - "note": "Monthly macroeconomic releases from the National Bureau of Statistics of China (NBS / 国家统计局), including CPI, PPI, industrial production, retail sales, fixed-asset investment and surveyed urban unemployment. The NBS National Data (EasyQuery) portal can be queried via parameterized endpoints such as https://data.stats.gov.cn/easyquery.htm?m=QueryData&dbcode=hgyd&rowcode=zb&colcode=sj, but responses are JSONP/HTML, require constructing wd/dfwds parameters, and are protected by anti-bot measures with no documented open API key. Marked todo until a stable scraper or API client is implemented." + "note": "Monthly macroeconomic releases from the National Bureau of Statistics of China. The EasyQuery portal can be queried via parameterized endpoints, but responses are JSONP/HTML, require undocumented wd/dfwds parameters, and are protected by anti-bot measures with no documented open API key. Marked todo until a stable scraper or API client is implemented." }, { "key": "macro_pboc_credit", @@ -149,10 +162,11 @@ "name_en": "PBOC Credit, Money Supply and Aggregate Financing", "url": "http://www.pbc.gov.cn/en/3688240/index.html", "access_method": "todo", + "parser": "cn_hf", "frequency": "monthly", "sector": "macro", "difficulty": "hard", - "note": "Monthly money supply, sources/uses of credit funds and aggregate financing data from the People’s Bank of China (Monetary Statistics pages). No stable open JSON or CSV endpoint is available; figures are published as HTML/Excel tables that require scraping and Chinese date parsing. Marked todo until a parser is implemented." + "note": "Monthly money supply, sources/uses of credit funds and aggregate financing data from the People’s Bank of China. No stable open JSON or CSV endpoint is currently available; figures are published as HTML/Excel tables that require scraping and Chinese date parsing. Marked todo until a reliable public access path is confirmed." }, { "key": "mobility_12306", @@ -160,6 +174,7 @@ "name_en": "12306 Railway Passenger Traffic", "url": "https://www.12306.cn", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "transport_logistics", "difficulty": "hard", @@ -171,10 +186,11 @@ "name_en": "Baidu Maps City Congestion Index", "url": "https://jiaotong.baidu.com/reports/", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "mobility", "difficulty": "hard", - "note": "Baidu Maps publishes city congestion rankings and reports on jiaotong.baidu.com, but no open bulk JSON/Csv endpoint exists. Real-time road traffic requires a Baidu Maps API key (ak) and is restricted by platform terms; implementation needs a scraper or authenticated API integration." + "note": "Baidu Maps publishes city congestion rankings and reports on jiaotong.baidu.com, but no open bulk JSON/CSV endpoint exists. Real-time road traffic requires a Baidu Maps API key (ak) and is restricted by platform terms; implementation needs a scraper or authenticated API integration." }, { "key": "mobility_baidu_migration", @@ -182,6 +198,7 @@ "name_en": "Baidu Migration Index (Qianxi)", "url": "http://huiyan.baidu.com/migration/cityrank.jsonp", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "mobility", "difficulty": "hard", @@ -193,6 +210,7 @@ "name_en": "Mobility & Box Office", "url": "https://piaofang.maoyan.com/dashboard", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "mobility", "difficulty": "hard", @@ -204,6 +222,7 @@ "name_en": "CAAC Monthly Civil Aviation KPIs", "url": "https://www.caac.gov.cn/XXGK/XXGK/TJSJ/TJSJ_1/", "access_method": "todo", + "parser": "cn_hf", "frequency": "monthly", "sector": "mobility", "difficulty": "hard", @@ -213,12 +232,14 @@ "key": "mobility_gaode", "name_zh": "高德地图城市拥堵延时指数", "name_en": "Amap City Congestion Delay Index", - "url": "https://report.amap.com/", - "access_method": "todo", + "url": "https://report.amap.com/diagnosis/ajax/countryindicators.do", + "access_method": "open_json", + "parser": "cn_hf", "frequency": "daily", "sector": "mobility", - "difficulty": "hard", - "note": "Amap (Gaode) publishes city congestion-delay and traffic-health dashboards at report.amap.com, updated daily/real-time. No open bulk JSON/CSV endpoint was found; Amap's LBS Traffic Status API requires an authenticated key and is governed by platform terms. Implementation needs a scraper or approved API integration." + "difficulty": "medium", + "unit": "index", + "note": "Amap's public traffic-report dashboard exposes an open JSON endpoint (report.amap.com/diagnosis/ajax/countryindicators.do) returning the current national average of the road-network trip delay index and related traffic-health indicators. No authentication is required." }, { "key": "property_cric", @@ -226,10 +247,11 @@ "name_en": "CRIC China Real Estate Data", "url": "https://www.cricbigdata.com/", "access_method": "todo", + "parser": "cn_hf", "frequency": "monthly", "sector": "property", "difficulty": "hard", - "note": "Commercial/proprietary real-estate data from CRIC (克尔瑞), a leading China property data and consulting provider. No public open API; indicators such as sales volume, inventory, prices, land auctions and developer rankings are behind a subscription/paywall and would require authenticated scraping or a commercial license. Marked todo until a scraper or API agreement is implemented." + "note": "Commercial/proprietary real-estate data from CRIC (克尔瑞). No public open API; indicators are behind a subscription/paywall. Stubbed until a lawful public endpoint or scraper is available." }, { "key": "property_zhongzhi_land", @@ -237,10 +259,11 @@ "name_en": "CREIS / Zhongzhi China Land Auction and Transaction Data", "url": "https://www.cih-index.com/landlist/land/", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "property", "difficulty": "hard", - "note": "Daily land auction, supply-plan and transaction listings published by China Index Academy (CREIS / 中指云 / 中指研究院) at cih-index.com. The public portal shows search/filter pages and limited preview records, but full parcel details, historical time-series and ranked city aggregates are gated behind login/subscription or delivered through the commercial API at https://api.cih-index.com/. No stable open JSON/CSV endpoint is available, so this source is marked todo until an authenticated scraper or commercial API agreement is implemented." + "note": "Daily land auction, supply-plan and transaction listings published by China Index Academy (CREIS / 中指云 / 中指研究院). The public portal shows search/filter pages and limited preview records, but full parcel details, historical time-series and ranked city aggregates are gated behind login/subscription or delivered through the commercial API at https://api.cih-index.com/. No stable open JSON/CSV endpoint is available, so this source is a TODO stub." }, { "key": "steel_100njz_construction", @@ -248,10 +271,11 @@ "name_en": "100njz Construction Steel Price", "url": "https://jiancai.mysteel.com/", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "steel", "difficulty": "hard", - "note": "百年建筑网(100njz.com)建筑钢材行情由上海钢联(Mysteel)建筑钢材频道(jiancai.mysteel.com)提供,日度更新。价格数据以HTML/动态渲染为主,需登录或商业数据接口,反爬机制强。TODO:实现专用scraper或接入Mysteel商业API。" + "note": "百年建筑网(100njz.com)建筑钢材行情由上海钢联(Mysteel)建筑钢材频道(jiancai.mysteel.com)提供,日度更新。公开页面仅展示'电议'报价,具体日度价格数据需登录或商业数据接口。TODO:接入Mysteel/100njz商业API或确认可公开访问的价格指数页面。" }, { "key": "steel_mysteel", @@ -259,10 +283,11 @@ "name_en": "Mysteel China Steel Prices", "url": "https://news.mysteel.com/", "access_method": "todo", + "parser": "cn_hf", "frequency": "daily", "sector": "steel", "difficulty": "hard", - "note": "Daily spot steel and raw-materials prices for China published by Mysteel (我的钢铁网), including rebar, hot-rolled coil, cold-rolled coil, iron ore and coke. The public news page shows latest prices but there is no stable open JSON/API endpoint; the full historical time-series and detailed indices are behind a subscription/anti-bot wall. Marked todo until a scraper or commercial API agreement is implemented." + "note": "Daily spot steel and raw-materials prices for China published by Mysteel. No stable open JSON/API endpoint is available; the full time-series is behind a subscription/anti-bot wall. TODO: implement a scraper or commercial API integration." }, { "key": "yiwu_index", @@ -276,4 +301,4 @@ "note": "Official Yiwu small-commodity price, prosperity and confidence indices published by the Ministry of Commerce / Yiwu Municipal Government and operated by Zhejiang China Commodity City Group. The public portal renders the data in a Nuxt SPA loaded from internal apiserver.chinagoods.com endpoints; there is no documented open JSON/API and the time-series is behind dynamic rendering and anti-bot controls. Marked TODO until a robust scraper or reverse-engineered endpoint is implemented." } ] -} \ No newline at end of file +} diff --git a/config/sources.yaml b/config/sources.yaml index 896f972..c6c062f 100644 --- a/config/sources.yaml +++ b/config/sources.yaml @@ -280,141 +280,56 @@ sources: schedule: "0 7 * * *" # 7 AM UTC daily collector_class: "collectors.cn_indicators.CNIndicatorsCollector" config: - enabled_sources: - # ── Freight / logistics ───────────────────────────────── - - key: bdi - name_zh: 波罗的海干散货指数 - name_en: Baltic Dry Index - url: "https://tradingeconomics.com/commodity/baltic" - method: GET - parser: json - unit: index - sector: transport_logistics - access: todo - note: "Daily bulk-shipping proxy. Public page; requires scraper or FRED API key." - - key: ccfi - name_zh: 中国出口集装箱运价指数 - name_en: China Containerized Freight Index - url: "http://www.sse.net.cn/index/singleIndex?indexType=ccfi" - method: GET - parser: json - unit: index - sector: transport_logistics - access: scrape - note: "Shanghai Shipping Exchange; HTML table scrape." - - key: scfi - name_zh: 上海出口集装箱运价指数 - name_en: Shanghai Containerized Freight Index - url: "http://www.sse.net.cn/index/singleIndex?indexType=scfi" - method: GET - parser: json - unit: index - sector: transport_logistics - access: scrape - note: "Shanghai Shipping Exchange; HTML table scrape." + # enabled_sources is intentionally omitted so the collector loads the full + # source catalog from config/cn_hf_sources.json at runtime. - # ── Trade hubs ────────────────────────────────────────── - - key: yiwu_index - name_zh: 义乌中国小商品指数 - name_en: Yiwu China Commodity Index - url: "http://www.ywindex.com/" - method: GET - parser: json - unit: index - sector: trade - access: todo - note: "Yiwu small-commodity wholesale benchmark; scrape or partner feed required." - # ── Autos ─────────────────────────────────────────────── - - key: cpca_retail_pv - name_zh: 乘联会乘用车零售销量 - name_en: CPCA Passenger Vehicle Retail Sales - url: "https://www.cpauto.com.cn/" - method: GET - parser: json - unit: units - sector: automotive - access: todo - note: "CPCA publishes via WeChat/website; requires scrape or data partner." - - key: cpca_wholesale_pv - name_zh: 乘联会乘用车批发销量 - name_en: CPCA Passenger Vehicle Wholesale Sales - url: "https://www.cpauto.com.cn/" - method: GET - parser: json - unit: units - sector: automotive - access: todo - note: "CPCA publishes via WeChat/website; requires scrape or data partner." - - # ── Steel ─────────────────────────────────────────────── - - key: steel_price_index - name_zh: 中国钢材价格指数 - name_en: China Steel Price Index - url: "https://www.mysteel.com/" - method: GET - parser: json - unit: CNY/t - sector: steel - access: todo - note: "Mysteel/Myspic index is commercial; public alternative under review." + # ── China Physical Anchors ─────────────────────────────────── + electricity_proxy: + enabled: false + schedule: "0 0 * * 0" # Weekly — OWID/Ember annual data, low churn + collector_class: "collectors.physical_electricity_proxy.ElectricityProxyCollector" + config: + # Public OWID energy dataset; no API key required. + indicator: "electricity_proxy_china" + unit: "TWh" - # ── Cement ────────────────────────────────────────────── - - key: cement_price_index - name_zh: 中国水泥价格指数 - name_en: China Cement Price Index - url: "https://www.ccement.com/" - method: GET - parser: json - unit: CNY/t - sector: construction_materials - access: todo - note: "Cement price regional averages; requires scrape." + sentinel2: + enabled: false + schedule: "0 0 * * 0" # Weekly — Copernicus catalogue scene-count proxy + collector_class: "collectors.physical_sentinel2.Sentinel2Collector" + config: + months_back: 3 + catalog_url: "https://catalogue.dataspace.copernicus.eu/odata/v1/Products" - # ── Coal / energy ─────────────────────────────────────── - - key: coal_price_index - name_zh: 中国煤炭价格指数 - name_en: China Coal Price Index - url: "https://www.cctd.com.cn/" - method: GET - parser: json - unit: CNY/t - sector: energy - access: todo - note: "Coal price benchmarks (e.g., CCI, CCTD); commercial/scrape." + hk_property: + enabled: false + schedule: "0 0 5 * *" # Monthly (5th) — HK RVD private domestic price index + collector_class: "collectors.elite_hk_property.HKPropertyCollector" + config: {} - # ── Property ──────────────────────────────────────────── - - key: property_price_index - name_zh: 商品住宅销售价格指数 - name_en: China Property Price Index - url: "https://www.stats.gov.cn/tjsj/" - method: GET - parser: json - unit: index - sector: property - access: todo - note: "NBS monthly property price data; PDF/HTML tables." + safe_net_errors: + enabled: false + schedule: "0 0 1 * *" # Monthly (1st) — World Bank BOP net errors/omissions + collector_class: "collectors.elite_safe_net_errors.SAFENetErrorsCollector" + config: + indicator: "BN.KAC.EOMS.CD" + country: "CHN" + start_year: 2015 + end_year: 2026 - # ── Mobility ──────────────────────────────────────────── - - key: baidu_mobility - name_zh: 百度迁徙/出行指数 - name_en: Baidu Migration/Mobility Index - url: "https://qianxi.baidu.com/" - method: GET - parser: json - unit: index - sector: consumer_mobility - access: todo - note: "Public dashboards but no stable open API; scrape or API partnership." + # ── China Elite Signals ────────────────────────────────────── + peoples_daily_byline: + enabled: false + schedule: "0 4 * * *" # Daily — People's Daily print-edition article-count proxy + collector_class: "collectors.elite_peoples_daily_byline.PeoplesDailyCollector" + config: + lookback_days: 7 + timeout: 30 - # ── Macro ─────────────────────────────────────────────── - - key: pboc_lpr - name_zh: 贷款市场报价利率 - name_en: PBOC Loan Prime Rate - url: "http://www.pbc.gov.cn/zhengcehuobisi/11111/index.html" - method: GET - parser: json - unit: pct - sector: macro - access: todo - note: "Monthly 1Y/5Y LPR; official PBOC site, HTML tables." + politburo_readouts: + enabled: false + schedule: "0 0 * * 0" # Weekly — Wikipedia-sourced Politburo meeting metadata + collector_class: "collectors.elite_politburo_readouts.PolitburoReadoutsCollector" + config: + url: "https://en.wikipedia.org/wiki/20th_Politburo_of_the_Chinese_Communist_Party" diff --git a/dashboards/conditions_dashboard.html b/dashboards/conditions_dashboard.html index 4a77f59..4adafad 100644 --- a/dashboards/conditions_dashboard.html +++ b/dashboards/conditions_dashboard.html @@ -243,7 +243,8 @@ function setFeed(state){ const tag=$("feedTag"), note=$("feedNote"); - tag.className="feed-tag "+esc(state); + const feedClass={live:"live",snapshot:"snapshot",err:"err",sample:"sample"}[state]||"sample"; + tag.className="feed-tag "+feedClass; if(state==="live"){tag.textContent="● LIVE"; note.textContent="streaming from cbb:latest";} else if(state==="snapshot"){tag.textContent="● SNAPSHOT"; note.textContent="real conditions pull "+(window.__CBB_EMBED_AT__||"")+" — start the API for live updates";} else if(state==="err"){tag.textContent="API UNREACHABLE"; note.textContent="showing last snapshot — start the backend for live signal";} @@ -262,11 +263,14 @@ {k:"Deteriorating", v:deteriorating, cls:"danger"}, {k:"High confidence", v:highConf, cls:"accent"}, ]; - $("stats").innerHTML=cells.map(c=>` -
+ $("stats").innerHTML=cells.map(c=>{ + const cls=({accent:"accent",warn:"warn",danger:"danger","":""})[c.cls]||""; + return ` +
${esc(c.k)}
${esc(c.v)}
-
`).join(""); +
`; + }).join(""); } function renderHeatmap(rows){ @@ -283,7 +287,7 @@ ${esc(r.sector)} ${esc(r.region)} ${esc(r.period)} - ${num(r.D)} + ${num(r.D)} ${num(r.SD)} ${num(r.AS)} ${momentumArrow(r.momentum)} diff --git a/monitoring/health/reports/deep_validation_2026-03-23.md b/monitoring/health/reports/deep_validation_2026-03-23.md new file mode 100644 index 0000000..5b4e27c --- /dev/null +++ b/monitoring/health/reports/deep_validation_2026-03-23.md @@ -0,0 +1,613 @@ +# Deep Source Validation Report — 2026-03-23 + +**Generated**: Sunday, March 23, 2026 +**Type**: Weekly Deep Validation (Automated) +**Scope**: All 25 data sources, 14 RSS feeds, infrastructure + +--- + +## EXECUTIVE SUMMARY + +| Category | OK | WARN | BROKEN | ACTION REQUIRED | +|----------|----|----|--------|-----------------| +| Structured Data Collectors | 6 | 2 | 1 | 3 | +| RSS Feeds | 9 | 3 | 2 | 2 | +| Social Scrapers | 5 | 3 | 2 | 2 | +| Messaging | 0 | 1 | 1 | 2 | +| Connectors | 3 | 0 | 0 | 0 | +| Infrastructure | 2 | 0 | 0 | 0 | + +**Critical Issues (Immediate Action):** +1. **RBI DBIE** — Domain changed from `dbie.rbi.org.in` to `data.rbi.org.in`. Collector is BROKEN. +2. **SEC EDGAR EFTS** — Returning HTTP 403. User-Agent and/or endpoint may need updating. +3. **Twitter/X** — Platform now breaks scrapers every 2-4 weeks. High maintenance burden. + +--- + +## 1. STRUCTURED DATA COLLECTORS + +### 1.1 RBI DBIE — `collectors/rbi_dbie.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **BROKEN — 301 REDIRECT** | +| **STRUCTURE** | **MODIFIED** — Entire domain migrated | +| **AUTH REQUIREMENTS** | UNKNOWN (new SPA may require different auth) | +| **NEW FEATURES** | Angular SPA with MapMyIndia integration, dev tools blocking | +| **RECOMMENDATION** | **REWRITE COLLECTOR** (Priority: CRITICAL) | +| **FILE TO EDIT** | `~/social_scraper/collectors/rbi_dbie.py` | + +**Details**: `dbie.rbi.org.in` now returns **301 Moved Permanently** → `https://data.rbi.org.in/DBIE/#/`. The new portal is an **Angular single-page application** with: +- Material Design UI components (CSS variables like `--mat-*`) +- MapMyIndia API integration for geographic data +- Right-click and F12 prevention (anti-debugging) +- Likely different API endpoints under the hood + +**Current collector** uses `BASE_URL = "https://dbie.rbi.org.in/DBIE"` with API calls to `/dbie/api/data`. These endpoints **will fail** after redirect. + +**Action Items**: +1. Investigate the new Angular SPA's backend API (check network requests in browser) +2. Update `BASE_URL` to `https://data.rbi.org.in/DBIE` +3. Discover new API endpoints (likely REST API behind the Angular frontend) +4. Update all `_scrape_rbi_page()` methods for new HTML structure +5. Add new session/cookie handling if the SPA requires it + +--- + +### 1.2 RBI Circulars — `collectors/rbi_circulars.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | **UNCHANGED** | +| **AUTH REQUIREMENTS** | UNCHANGED (no auth) | +| **NEW FEATURES** | Accessibility controls, Hindi language toggle | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: `rbi.org.in` is fully operational (last updated March 21, 2026). Press releases, notifications, and circulars sections are all visible with expected URL patterns: +- Press releases: `/Scripts/BS_PressReleaseDisplay.aspx?prid=[number]` +- Circulars: `/Scripts/BS_ViewMasterCirculardetails.aspx` + +--- + +### 1.3 NSE Bhavcopy — `collectors/nse_bhavcopy.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP (with heavy anti-bot)** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | **TIGHTENED** — Request timed out during fetch | +| **NEW FEATURES** | None detected | +| **RECOMMENDATION** | **Investigate manually** — verify cookie-based session still works | +| **FILE TO EDIT** | `~/social_scraper/collectors/nse_bhavcopy.py` | + +**Details**: NSE website timed out during automated fetch (60s timeout), consistent with their aggressive anti-bot protection. The collector already uses proper headers and cookie-based session (`_get_nse_cookies()`), but NSE may have tightened protections. + +**Action Items**: +1. Test collector manually in Docker environment +2. Consider rotating User-Agent strings +3. Monitor for new Cloudflare/WAF rules + +--- + +### 1.4 BSE API — `collectors/bse_api.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | **UNCHANGED** | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **NEW FEATURES** | Google Analytics `G-TM52BJH9HF` tracking | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: BSE site fully operational with Angular.js templating. Key endpoints intact: +- `/corporates/anndet_new.aspx` — Corporate announcements +- `/markets/equity/searchsecurity.aspx` — Security search +- `/markets/Derivatives/DeriReports/` — Derivatives +- `/markets/Equity/EQReports/BlockDeals.html` — Block deals + +--- + +### 1.5 CCIL Rates — `collectors/ccil_rates.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | **UNCHANGED** | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **NEW FEATURES** | Real-time charting (2/5/10/15/30/60-min intervals), CASBI index | +| **RECOMMENDATION** | No action (consider adding CASBI index as new data source) | +| **FILE TO EDIT** | N/A (optional: add CASBI) | + +**Details**: CCIL site operational with all expected sections: +- FBIL reference rates (via navigation, not homepage) +- Zero Coupon Yield Curve (ZCYC) under Data & Statistics +- Call market data (Open, High, Low, LTR, Volume, WAR) +- TREPS data in consolidated money market display +- CP/CD rates section in navigation + +**New Opportunity**: CCIL All Sovereign Bonds Index (CASBI) — new index tracking tool. Could be valuable for treasury analysis. + +--- + +### 1.6 FRED API — `collectors/fred_api.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | UNCHANGED (API key required, DEMO_KEY limited) | +| **NEW FEATURES** | None | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: FRED API returns HTTP 400 with `DEMO_KEY` (expected — the demo key has strict limits). With a proper `FRED_API_KEY`, the API works correctly. Response format unchanged: JSON with `observations`, `realtime_start`, pagination. + +**Series IDs verified as valid**: FEDFUNDS, CPIAUCSL, DGS10, DGS2, DTWEXBGS, UNRATE, GDP, SOFR, T10Y2Y, VIXCLS, BAMLH0A0HYM2 — all still listed in FRED catalog. + +--- + +### 1.7 SEBI Circulars — `collectors/sebi_circulars.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | **UNCHANGED** | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **NEW FEATURES** | None | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: SEBI website operational. Acts/circulars listing uses simple two-column table format. Navigation includes Circulars, General Orders, Acts, Rules, Regulations as distinct categories. JavaScript datepicker for date filtering. + +--- + +### 1.8 data.gov.in — `collectors/data_gov_in.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP (partially rendered)** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | UNCHANGED (API key via `DATA_GOV_API_KEY`) | +| **NEW FEATURES** | Platform now has 454,238 resources and 236,593 APIs | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: OGD India platform operational. Bootstrap 4.6.0 CSS confirmed. API endpoint at `data.gov.in/apis` still functional. CPI, WPI, IIP, GDP, GST datasets should be accessible via API key. + +**New Discovery**: **API Setu** (`apisetu.gov.in`) — Government of India's Open API Platform from MeitY. May provide additional structured data feeds worth investigating. + +--- + +### 1.9 World Bank API — `collectors/world_bank.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | UNCHANGED (no auth) | +| **NEW FEATURES** | Last updated 2026-02-24 | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: API responds with correct JSON structure. Pagination working (`page:1, pages:66, per_page:1, total:66`). India GDP indicator available. Note: 2025 value is `null` (data not yet released), which is expected — the collector should handle null values gracefully. + +Indicators verified: NY.GDP.MKTP.CD, FP.CPI.TOTL.ZG, BN.CAB.XOKA.CD +Countries verified: IN, US, CN, GB, JP, DE + +--- + +### 1.10 IMF Data — `collectors/imf_data.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **DOWN (404)** | +| **STRUCTURE** | **POTENTIALLY CHANGED** | +| **AUTH REQUIREMENTS** | UNKNOWN | +| **NEW FEATURES** | N/A | +| **RECOMMENDATION** | **Investigate manually** — check if API URL has changed | +| **FILE TO EDIT** | `~/social_scraper/collectors/imf_data.py` | + +**Details**: `data.imf.org/api/views/metadata` returned HTTP 404. The IMF may have reorganized its API endpoints. The IMF has been modernizing its data portal — endpoint structure may have changed. + +**Action Items**: +1. Check `https://data.imf.org` manually for new API documentation +2. Verify IFS, DOT, BOP dataset access URLs +3. Check if they've moved to a new API version or domain + +--- + +## 2. RSS FEEDS — `collectors/rss_feeds.py` + +| Feed | Status | Structure | Notes | +|------|--------|-----------|-------| +| **reuters_business** | **BLOCKED** | N/A | `feeds.reuters.com` unreachable (geo-blocked or deprecated) | +| **reuters_markets** | **BLOCKED** | N/A | Same issue as reuters_business | +| **et_economy** | **BLOCKED** | N/A | `economictimes.indiatimes.com` blocks automated fetches | +| **et_markets** | **BLOCKED** | N/A | Same as et_economy | +| **mint_economy** | **BLOCKED** | N/A | `livemint.com` blocks automated fetches | +| **mint_markets** | **BLOCKED** | N/A | Same as mint_economy | +| **moneycontrol** | **BLOCKED** | N/A | `moneycontrol.com` blocks automated fetches | +| **rbi_press** | **UP** | UNCHANGED | Via rbi.org.in (confirmed working) | +| **fed_press** | **UP** | UNCHANGED | Standard RSS 2.0, 20 items, proper fields | +| **ecb_press** | **UP** | UNCHANGED | RSS 2.0, active feed (March 21, 2026 latest) | +| **coindesk** | **UP** | UNCHANGED | RSS 2.0, 25 items, media:content extensions | +| **cnbc** | **UP** | UNCHANGED | RSS 2.0, 30 items, custom metadata elements | +| **ft_markets** | **UNKNOWN** | N/A | Could not test (likely requires auth) | +| **arxiv_qfin** | **UP (empty)** | UNCHANGED | Feed structure valid but 0 items on Sunday | + +**Note on BLOCKED feeds**: Reuters, ET, Mint, MoneyControl all block automated HTTP requests. This does NOT necessarily mean the collector is broken — the collector running inside Docker with proper headers/cookies may still work. The blocks are on the fetch tool I used, not necessarily on the collector's HTTP client. + +**Recommendation**: +- Verify reuters feeds in Docker — Reuters has been deprecating old RSS URLs in favor of new ones +- ET, Mint, MoneyControl — test from Docker with full headers +- Consider alternative Reuters feed URLs if old ones are dead + +--- + +## 3. SOCIAL SCRAPERS + +### 3.1 Reddit — `scrapers/reddit_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP (with caveats)** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | **TIGHTENED** | +| **NEW FEATURES** | N/A | +| **RECOMMENDATION** | **Update parser** — verify rate limits | +| **FILE TO EDIT** | `~/social_scraper/scrapers/reddit_scraper.py` | + +**Details**: Reddit's public JSON API (`old.reddit.com/r/xxx/.json`) still works but: +- Rate limit: ~60 requests/minute (free, non-commercial) +- API pricing stabilized at $0.24/1K calls for commercial use (no increases since 2023) +- Reddit aggressively monitors for scrapers and will block suspicious patterns +- Authentication requires proper `user_agent` with username identification + +**Subreddits verified accessible**: wallstreetbets, cryptocurrency, stocks + +**Action Items**: +1. Ensure `user_agent` identifies the project and a contact email +2. Verify rate limiting is properly configured (collector uses BaseScraper rate limit) +3. Consider using PRAW for more reliable access + +--- + +### 3.2 Hacker News — `scrapers/hackernews_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | UNCHANGED (no auth) | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: Firebase API at `hacker-news.firebaseio.com/v0/` fully operational. Returns JSON arrays of story IDs as expected. Top stories, new stories endpoints working. + +--- + +### 3.3 YouTube — `scrapers/youtube_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **NEW FEATURES** | N/A | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: YouTube Data API v3 quota remains at 10,000 units/day. Search costs 100 units/call, video list costs 1 unit/call. No major quota changes announced for 2026. Quota increase requires compliance audit (free). + +--- + +### 3.4 Mastodon — `scrapers/mastodon_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP (requires auth for public timeline)** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | **CHANGED** — Public timeline now returns 422 | +| **RECOMMENDATION** | **Update parser** — may need authentication | +| **FILE TO EDIT** | `~/social_scraper/scrapers/mastodon_scraper.py` | + +**Details**: `mastodon.social/api/v1/timelines/public` returned HTTP 422 (Unprocessable Entity). This suggests the public timeline API now requires authentication or additional parameters. Individual instance timelines may still work differently. + +**Action Items**: +1. Add OAuth token for mastodon.social access +2. Test with `?local=true` parameter +3. Check if financial-specific instances have different restrictions + +--- + +### 3.5 GitHub — `scrapers/github_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | UNCHANGED | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +**Details**: GitHub REST API v3 operational. Rate limits (unauthenticated): +- Core: 60 req/window +- Search: 10 req/window +- Code Search: 60 req/window +- GraphQL: 0 (requires auth) + +With `GITHUB_TOKEN` (as configured in docker-compose), limits increase to 5,000/hour for core. + +--- + +### 3.6 SEC EDGAR — `scrapers/sec_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **BROKEN (403 Forbidden)** | +| **STRUCTURE** | **CHANGED** | +| **AUTH REQUIREMENTS** | **NEW RESTRICTIONS** | +| **NEW FEATURES** | EDGAR 26.1 release, new taxonomy support, beta environment | +| **RECOMMENDATION** | **Update parser** (Priority: HIGH) | +| **FILE TO EDIT** | `~/social_scraper/scrapers/sec_scraper.py` | + +**Details**: The EFTS (Full-Text Search) endpoint at `efts.sec.gov/LATEST/search-index` returns **HTTP 403 Forbidden**. Key changes: + +1. **EDGAR Release 26.1** (March 16-18, 2026) — major modernization: + - New filing fee validation (suspends incorrect filings) + - 2026 taxonomy versions accepted + - ACH limit dropped from ~$100M to ~$25M per transaction +2. SEC now requires a **proper User-Agent header** with contact info +3. The current User-Agent in the scraper is `"SocialScraper research@example.com"` — the placeholder email may be rejected +4. New **EDGAR Beta Environment** previewing API changes + +**Action Items**: +1. Update User-Agent to include a real email: `"EconScraper/1.0 (your-real-email@domain.com)"` +2. Check if `efts.sec.gov/LATEST/search-index` has been replaced — try the new EDGAR API at `api.edgarfiling.sec.gov` +3. Verify EFTS endpoint against SEC's developer resources at `sec.gov/about/developer-resources` +4. Test with `data.sec.gov` endpoints as alternative +5. Consider adding the EDGAR Beta environment for forward compatibility testing + +--- + +### 3.7 Discord — `scrapers/discord_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UNKNOWN** (requires bot token to test) | +| **STRUCTURE** | UNCHANGED (Discord API is stable) | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **RECOMMENDATION** | No action | +| **FILE TO EDIT** | N/A | + +--- + +### 3.8 Dark Web — `scrapers/darkweb_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UNKNOWN** (requires Tor SOCKS5 proxy) | +| **STRUCTURE** | N/A | +| **AUTH REQUIREMENTS** | N/A | +| **RECOMMENDATION** | Verify Tor proxy connectivity in Docker | +| **FILE TO EDIT** | N/A | + +--- + +### 3.9 Web Scraper — `scrapers/web_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** (generic scraper, target-dependent) | +| **STRUCTURE** | N/A | +| **RECOMMENDATION** | No action | + +--- + +### 3.10 Central Banks — `scrapers/centralbank_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UP** | +| **STRUCTURE** | UNCHANGED | +| **RECOMMENDATION** | No action | + +**Details**: Fed, ECB, RBI press feeds all verified working (see RSS section). Fed RSS has categories: Monetary Policy, Enforcement Actions, Banking Policy, Orders. ECB feed active through March 21, 2026. Notable: Fed has released regulatory capital framework modernization proposals and tokenized securities capital treatment guidance. + +--- + +## 4. MESSAGING SOURCES + +### 4.1 Telegram — `collectors/telegram_channels.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **UNKNOWN** (requires api_id/api_hash) | +| **STRUCTURE** | N/A | +| **AUTH REQUIREMENTS** | UNCHANGED | +| **RECOMMENDATION** | **Verify channels still active** | +| **FILE TO EDIT** | N/A | + +**Channels to verify**: BloombergMarketsLive, financialjuice, WallStreetSilverOfficial, raboratory + +--- + +### 4.2 Twitter — `collectors/twitter_lists.py` + `scrapers/twitter_scraper.py` + +| Field | Status | +|-------|--------| +| **ENDPOINT STATUS** | **HIGH RISK** | +| **STRUCTURE** | **FREQUENTLY CHANGING** | +| **AUTH REQUIREMENTS** | **TIGHTENED SIGNIFICANTLY** | +| **NEW FEATURES** | $15K liquidated damages clause for >1M automated requests/day | +| **RECOMMENDATION** | **Investigate manually** (Priority: HIGH) | +| **FILE TO EDIT** | `~/social_scraper/scrapers/twitter_scraper.py` + `~/social_scraper/collectors/twitter_lists.py` | + +**Details**: Twitter/X has become the most hostile platform for scraping as of 2026: +- **Difficulty rating**: Hard (4/5) due to Cloudflare WAF, login wall, aggressive rate limiting +- **Legal risk**: ToS states >1M automated requests/24h = **$15,000 liquidated damages** +- **Defensive updates every 2-4 weeks** that break DIY scrapers +- **Estimated maintenance**: 10-15 hours/month to keep working +- **twikit library** (used by the scraper) may or may not keep up with changes + +**Action Items**: +1. Test if current twikit-based scraper still authenticates +2. Verify cookie-based auth flow hasn't been blocked +3. Consider reducing scrape frequency to minimize detection risk +4. Evaluate if the data value justifies the maintenance cost +5. Consider alternative data sources for financial Twitter sentiment + +--- + +## 5. CONNECTORS + +### 5.1 DragonScope Connector — `connectors/dragonscope.py` + +| Field | Status | +|-------|--------| +| **STATUS** | **OK (config-dependent)** | +| **RECOMMENDATION** | No action | + +Configuration via env vars: `DRAGONSCOPE_REDIS_URL`, `DRAGONSCOPE_API_URL` + +--- + +### 5.2 LiquiFi Connector — `connectors/liquifi.py` + +| Field | Status | +|-------|--------| +| **STATUS** | **OK (config-dependent)** | +| **RECOMMENDATION** | No action | + +Configuration via env vars: `LIQUIFI_REDIS_URL`, `LIQUIFI_API_URL` + +--- + +### 5.3 Router — `connectors/router.py` + +| Field | Status | +|-------|--------| +| **STATUS** | **OK** | +| **RECOMMENDATION** | No action | + +Routing task runs every 3 minutes on `routing` queue. + +--- + +## 6. INFRASTRUCTURE + +### 6.1 Celery Beat Schedules + +**Source of truth**: `core/scheduler.py` (reads `config/sources.yaml` dynamically) +**Legacy file**: `scheduler/schedule.py` (DEPRECATED — contains only warnings) + +**Schedule Comparison — No Mismatches Found**: + +All YAML-configured sources have corresponding schedule entries in `build_beat_schedule()`: +- 13 YAML-driven collector tasks → `collectors` queue +- 12 hardcoded scraper tasks → `collectors` queue +- 9 system tasks (processing, health, routing, cleanup, reporting) + +**Potential Issue**: `scheduler/schedule.py` still exists as deprecated file — consider removing to avoid confusion. + +### 6.2 Docker Compose + +**Services**: 11 containers, all properly configured: +- No port conflicts detected (5432, 6379, 9000/9001, 2181, 9092, 9050/8118, 8000, 5555) +- Health checks configured for postgres, redis, api +- Worker concurrency: 6 (general), 2 (NLP) +- All environment variables properly referenced + +**No issues found.** + +--- + +## 7. NEW DATA SOURCES & OPPORTUNITIES + +### 7.1 Discovered — Worth Investigating + +| Source | URL | Value for NBFC Treasury | +|--------|-----|------------------------| +| **API Setu** | `apisetu.gov.in` | Government of India Open API platform — may have new fiscal/tax data feeds | +| **CCIL CASBI Index** | Via CCIL site | All Sovereign Bonds Index — valuable for treasury portfolio benchmarking | +| **ICICI Breeze API** | `icicidirect.com/api/breeze` | Free API for NSE/BSE data — alternative to direct NSE scraping | +| **Global Datafeeds** | `globaldatafeeds.in/apis` | Comprehensive Indian exchange data (NSE, NFO, BSE, MCX) via API | +| **Indian Stock Market API (GitHub)** | `github.com/0xramm/Indian-Stock-Market-API` | Free REST API for NSE/BSE via Yahoo Finance, no API key required | +| **Fed Tokenized Securities Guidance** | Via Fed RSS | New regulatory framework for tokenized securities — relevant for digital asset treasury | +| **EDGAR Beta Environment** | `sec.gov/submit-filings/improving-edgar/edgar-beta-environment` | Preview of new EDGAR API changes | + +### 7.2 API Change Alerts + +| Platform | Change | Impact | +|----------|--------|--------| +| **Reddit** | API pricing stable at $0.24/1K calls (commercial). Non-commercial still free at 60 req/min | LOW — current scraper is non-commercial | +| **Twitter/X** | $15K liquidated damages for >1M automated requests/day. Defensive changes every 2-4 weeks | HIGH — evaluate continued usage | +| **YouTube** | 10K units/day quota unchanged. No 2026 changes announced | NONE | +| **SEC EDGAR** | Release 26.1 (March 2026): new taxonomies, filing fee changes, API modernization | HIGH — EFTS returning 403 | +| **GitHub** | No significant API changes | NONE | +| **Mastodon** | Public timeline may now require auth (422 response) | MEDIUM | + +--- + +## 8. PRIORITY ACTION ITEMS + +### CRITICAL (Fix This Week) + +1. **RBI DBIE Collector Rewrite** — Domain migrated to `data.rbi.org.in/DBIE`. Current collector will fail on all datasets. + - File: `~/social_scraper/collectors/rbi_dbie.py` + - Action: Investigate new Angular SPA backend API, update BASE_URL and all endpoint paths + +2. **SEC EDGAR Scraper Fix** — EFTS API returning 403. + - File: `~/social_scraper/scrapers/sec_scraper.py` + - Action: Update User-Agent to real email, check new EDGAR API endpoints at `api.edgarfiling.sec.gov` + +### HIGH (Fix Within 2 Weeks) + +3. **Twitter Scraper Assessment** — Platform actively breaking scrapers every 2-4 weeks. + - Files: `~/social_scraper/scrapers/twitter_scraper.py`, `~/social_scraper/collectors/twitter_lists.py` + - Action: Test current twikit flow, evaluate cost vs value, consider reducing frequency + +4. **IMF Data API Investigation** — Metadata endpoint returning 404. + - File: `~/social_scraper/collectors/imf_data.py` + - Action: Check new IMF data portal API documentation + +### MEDIUM (Fix Within 1 Month) + +5. **Mastodon Scraper Auth** — Public timeline requires authentication. + - File: `~/social_scraper/scrapers/mastodon_scraper.py` + - Action: Add OAuth token, test with `?local=true` parameter + +6. **NSE Anti-Bot Verification** — Timeout during fetch may indicate tightened protection. + - File: `~/social_scraper/collectors/nse_bhavcopy.py` + - Action: Test from Docker, consider User-Agent rotation + +### LOW (Backlog) + +7. **Reuters RSS Feed URLs** — May be deprecated. Verify or find alternatives. +8. **Remove deprecated `scheduler/schedule.py`** — Causes confusion. +9. **Add CCIL CASBI Index** — New sovereign bond index for treasury analysis. +10. **Evaluate API Setu** — Government API gateway may have useful data. + +--- + +## 9. SOURCES CONSULTED + +- [Reddit API Pricing 2026](https://easyreadernews.com/reddit-api-pricing-explained-costs-limits-and-what-you-should-know-in-2026/) +- [How to Scrape Reddit in 2026](https://dev.to/agenthustler/how-to-scrape-reddit-in-2026-3-methods-that-still-work-402b) +- [SEC EDGAR API Development Toolkit](https://api.edgarfiling.sec.gov/) +- [SEC Developer Resources](https://www.sec.gov/about/developer-resources) +- [EDGAR Release 26.1](https://filepoint.com/news-resources/edgar-release-26-1/) +- [Draft 2026 SEC Taxonomies](https://www.sec.gov/newsroom/whats-new/2509-draft-2026-sec-taxonomies) +- [Twitter Scraping History 2026](https://scrapebadger.com/blog/twitter-scraping-history-landscape-for-2026) +- [YouTube API Quota 2026](https://zernio.com/blog/youtube-api-limits-how-to-calculate-api-usage-cost-and-fix-exceeded-api-quota) +- [India Open Data APIs](https://www.data.gov.in/apis) +- [API Setu](https://www.apisetu.gov.in/) +- [Indian Financial Data APIs 2026](https://www.nb-data.com/p/best-financial-data-apis-in-2026) +- [Free Indian Stock Market API](https://github.com/0xramm/Indian-Stock-Market-API) + +--- + +*Report generated by automated deep validation task. Next run: 2026-03-30.* diff --git a/processors/cbb_quality.py b/processors/cbb_quality.py new file mode 100644 index 0000000..0df63e5 --- /dev/null +++ b/processors/cbb_quality.py @@ -0,0 +1,425 @@ +"""Per-source data-quality validation for China Beige-Book-style sources. + +Validates rows destined for the ``EconomicData`` table from: + +* ``collectors/comtrade_mirror.py`` -> ``source = "comtrade_mirror"`` +* ``collectors/cn_indicators.py`` -> ``source = "cn_indicators"`` + +The module is importable and runnable without a database; it accepts plain +dicts and returns JSON-serialisable reports. +""" + +from __future__ import annotations + +import json +import math +from collections import defaultdict +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Optional + + +# ── Config paths (relative to project root) ────────────────────────── +_CATALOG_PATH = Path(__file__).resolve().parent.parent / "config" / "cn_hf_sources.json" + + +# ── Frequency helpers ──────────────────────────────────────────────── + +_FREQUENCY_DAYS: dict[str, int] = { + "daily": 1, + "weekly": 7, + "biweekly": 14, + "monthly": 30, + "quarterly": 91, + "annual": 365, + "yearly": 365, +} + +_DEFAULT_FREQUENCY_DAYS = 30 + + +def _frequency_to_days(freq: Optional[str]) -> int: + """Map a catalog frequency string to an approximate number of days.""" + if not freq: + return _DEFAULT_FREQUENCY_DAYS + return _FREQUENCY_DAYS.get(str(freq).lower().strip(), _DEFAULT_FREQUENCY_DAYS) + + +# ── Date / value parsing helpers ───────────────────────────────────── + +def _to_datetime(value: Any) -> Optional[datetime]: + """Convert a raw date value to a timezone-aware UTC datetime.""" + if value is None: + return None + + if isinstance(value, datetime): + if value.tzinfo is None: + return value.replace(tzinfo=timezone.utc) + return value.astimezone(timezone.utc) + + if isinstance(value, bool): + return None + + if isinstance(value, (int, float)): + try: + return datetime(int(value), 1, 1, tzinfo=timezone.utc) + except (ValueError, OverflowError): + return None + + s = str(value).strip() + if not s: + return None + + # ISO / pandas timestamp. + try: + parsed = datetime.fromisoformat(s.replace("Z", "+00:00")) + if parsed.tzinfo is None: + return parsed.replace(tzinfo=timezone.utc) + return parsed.astimezone(timezone.utc) + except (ValueError, TypeError): + pass + + for fmt in ("%Y-%m-%d", "%Y-%m", "%Y", "%Y/%m/%d", "%d-%m-%Y"): + try: + return datetime.strptime(s, fmt).replace(tzinfo=timezone.utc) + except ValueError: + pass + + return None + + +def _is_numeric(value: Any) -> bool: + """Return True if value is a finite int or float (bools are excluded).""" + if value is None or isinstance(value, bool): + return False + if isinstance(value, (int, float)): + return math.isfinite(value) + s = str(value).strip().replace(",", "") + if s.lower() in ("", ".", "-", "nd", "na", "n/a", "null", "none"): + return False + try: + return math.isfinite(float(s)) + except ValueError: + return False + + +# ── Schema helpers ─────────────────────────────────────────────────── + +def _check_schema(row: dict, idx: int) -> list[str]: + """Return a list of human-readable schema errors for one row.""" + errors: list[str] = [] + required = ("source", "indicator", "date", "value") + for field in required: + if field not in row or row[field] is None or row[field] == "": + errors.append(f"row {idx}: missing required field '{field}'") + + if "value" in row and not _is_numeric(row["value"]): + errors.append(f"row {idx}: 'value' is not numeric ({row.get('value')!r})") + + if "date" in row: + parsed_date = _to_datetime(row["date"]) + if parsed_date is None: + errors.append(f"row {idx}: 'date' is not parseable ({row.get('date')!r})") + + return errors + + +def _load_cn_catalog() -> list[dict]: + """Load the cn_indicators catalog from config/cn_hf_sources.json.""" + try: + if _CATALOG_PATH.exists(): + data = json.loads(_CATALOG_PATH.read_text(encoding="utf-8")) + if isinstance(data, dict): + return data.get("sources", []) or data.get("enabled_sources", []) or [] + if isinstance(data, list): + return data + except Exception: + return [] + return [] + + +# ── Source validators ──────────────────────────────────────────────── + +def validate_comtrade(rows: list[dict], now: Optional[datetime] = None, freshness_days: int = 7) -> dict: + """Validate a batch of ``comtrade_mirror`` rows. + + Checks: + * Required fields: ``source``, ``indicator``, ``date``, ``value``. + * ``value`` is numeric and finite. + * ``date`` is parseable. + * Freshness: the most recent ``collected_at`` (or ``date`` if absent) + is within ``freshness_days`` of ``now``. + + Returns a JSON-serialisable report dict. + """ + if now is None: + now = datetime.now(timezone.utc) + + schema_errors: list[str] = [] + bad_rows = 0 + + for idx, row in enumerate(rows): + row_errors = _check_schema(row, idx) + if row_errors: + schema_errors.extend(row_errors) + bad_rows += 1 + + schema_ok = len(rows) == 0 or bad_rows == 0 + + # Freshness is driven by the most recent collection time if available, + # otherwise by the most recent observation date. This matches the + # comtrade_mirror schedule (every 6 hours) while still tolerating + # monthly observation dates. + latest_collected_at: Optional[datetime] = None + latest_date: Optional[datetime] = None + for row in rows: + if "collected_at" in row: + ts = _to_datetime(row["collected_at"]) + if ts is not None and (latest_collected_at is None or ts > latest_collected_at): + latest_collected_at = ts + if "date" in row: + d = _to_datetime(row["date"]) + if d is not None and (latest_date is None or d > latest_date): + latest_date = d + + freshness_timestamp = latest_collected_at if latest_collected_at is not None else latest_date + + if freshness_timestamp is None: + freshness_age_days = None + freshness_ok = len(rows) == 0 # Empty batch is trivially fresh. + else: + age = now - freshness_timestamp + freshness_age_days = age.total_seconds() / 86400.0 + freshness_ok = freshness_age_days <= freshness_days + + return { + "source": "comtrade_mirror", + "row_count": len(rows), + "schema_valid": schema_ok, + "schema_errors": schema_errors, + "bad_rows": bad_rows, + "freshness_valid": freshness_ok, + "freshness_threshold_days": freshness_days, + "freshness_age_days": freshness_age_days, + "latest_timestamp": freshness_timestamp.isoformat() if freshness_timestamp else None, + } + + +def validate_cn_indicators( + rows: list[dict], + catalog: Optional[list[dict]] = None, + now: Optional[datetime] = None, +) -> dict: + """Validate a batch of ``cn_indicators`` rows. + + Rows are grouped by ``indicator``. For each indicator the catalog is + consulted for ``frequency``; freshness is checked so that the latest + observation is not older than ``frequency * 2``. + + ``catalog`` should be the list of source dicts from + ``config/cn_hf_sources.json``. If omitted, the file is loaded on disk; + if it is unavailable an empty catalog is used and every indicator is + reported as ``catalog_missing``. + """ + if now is None: + now = datetime.now(timezone.utc) + + if catalog is None: + catalog = _load_cn_catalog() + + freq_by_indicator = { + item.get("key", item.get("indicator")): item.get("frequency", "unknown") + for item in catalog + } + + # Group rows by indicator. + grouped: dict[str, list[dict]] = defaultdict(list) + schema_errors: list[str] = [] + bad_rows = 0 + + for idx, row in enumerate(rows): + schema_row_errors = _check_schema(row, idx) + if schema_row_errors: + schema_errors.extend(schema_row_errors) + bad_rows += 1 + indicator = str(row.get("indicator", "__unknown__")) + grouped[indicator].append(row) + + schema_ok = len(rows) == 0 or bad_rows == 0 + + indicator_reports: dict[str, dict] = {} + overall_freshness_ok = True + + for indicator, group_rows in grouped.items(): + latest_date: Optional[datetime] = None + for row in group_rows: + d = _to_datetime(row.get("date")) + if d is not None and (latest_date is None or d > latest_date): + latest_date = d + + freq = freq_by_indicator.get(indicator, "unknown") + freq_days = _frequency_to_days(freq) + threshold_days = freq_days * 2 + + if latest_date is None: + age_days = None + fresh = False + else: + age = now - latest_date + age_days = age.total_seconds() / 86400.0 + fresh = age_days <= threshold_days + + if not fresh: + overall_freshness_ok = False + + indicator_reports[indicator] = { + "row_count": len(group_rows), + "catalog_frequency": freq, + "freshness_threshold_days": threshold_days, + "freshness_age_days": age_days, + "latest_date": latest_date.isoformat() if latest_date else None, + "freshness_valid": fresh, + "catalog_missing": indicator not in freq_by_indicator, + } + + freshness_ok = len(rows) == 0 or overall_freshness_ok + + return { + "source": "cn_indicators", + "row_count": len(rows), + "schema_valid": schema_ok, + "schema_errors": schema_errors, + "bad_rows": bad_rows, + "freshness_valid": freshness_ok, + "indicators": indicator_reports, + } + + +# ── Overall runner ─────────────────────────────────────────────────── + +def run_quality_report(rows: list[dict], now: Optional[datetime] = None) -> dict: + """Run quality validation on a mixed list of EconomicData rows. + + Rows are routed by their ``source`` field. The report contains an + overall status plus per-source sub-reports from the dedicated validators. + """ + if now is None: + now = datetime.now(timezone.utc) + + comtrade_rows = [r for r in rows if r.get("source") == "comtrade_mirror"] + cn_rows = [r for r in rows if r.get("source") == "cn_indicators"] + other_rows = [r for r in rows if r.get("source") not in ("comtrade_mirror", "cn_indicators")] + + catalog = _load_cn_catalog() + + comtrade_report = validate_comtrade(comtrade_rows, now=now) + cn_report = validate_cn_indicators(cn_rows, catalog=catalog, now=now) + + reports = {} + if comtrade_rows or any(r.get("source") == "comtrade_mirror" for r in rows): + reports["comtrade_mirror"] = comtrade_report + if cn_rows or any(r.get("source") == "cn_indicators" for r in rows): + reports["cn_indicators"] = cn_report + + # Overall status. + flags = [] + for rep in reports.values(): + if not rep.get("schema_valid"): + flags.append("schema_error") + if not rep.get("freshness_valid"): + flags.append("stale") + + if not reports: + status = "empty" + elif "schema_error" in flags: + status = "fail" + elif "stale" in flags: + status = "degraded" + else: + status = "ok" + + return { + "status": status, + "generated_at": now.isoformat(), + "total_rows": len(rows), + "row_counts": { + "comtrade_mirror": len(comtrade_rows), + "cn_indicators": len(cn_rows), + "other": len(other_rows), + }, + "sources": reports, + } + + +# ── Synthetic fixtures / CLI ───────────────────────────────────────── + +def _synthetic_rows() -> list[dict]: + """Return a small set of offline fixture rows for manual testing.""" + now = datetime.now(timezone.utc) + + comtrade_fresh = { + "source": "comtrade_mirror", + "indicator": "trade_X_84", + "date": now.replace(day=1, hour=0, minute=0, second=0, microsecond=0), + "value": 1_200_000.0, + "collected_at": now, + "extra_data": {"hs": "84", "flow": "X", "view": "reported"}, + } + comtrade_stale = { + "source": "comtrade_mirror", + "indicator": "trade_M_85", + "date": now.replace(year=now.year - 1, day=1, hour=0, minute=0, second=0, microsecond=0), + "value": 800_000.0, + "collected_at": now.replace(year=now.year - 1), + "extra_data": {"hs": "85", "flow": "M", "view": "reported"}, + } + cn_fresh = { + "source": "cn_indicators", + "indicator": "ccfi", + "date": now, + "value": 1234.5, + "collected_at": now, + "extra_data": {"sector": "transport_logistics", "frequency": "weekly"}, + } + cn_stale = { + "source": "cn_indicators", + "indicator": "bdi", + "date": now.replace(day=1) if now.day > 5 else now.replace(month=now.month - 1 or 12, day=1), + "value": 1500.0, + "collected_at": now, + "extra_data": {"sector": "transport_logistics", "frequency": "daily"}, + } + cn_bad_value = { + "source": "cn_indicators", + "indicator": "macro_customs", + "date": now.replace(day=1), + "value": "not_a_number", + "collected_at": now, + } + return [comtrade_fresh, comtrade_stale, cn_fresh, cn_stale, cn_bad_value] + + +def _load_sample_data() -> list[dict]: + """Try to load real EconomicData-shaped rows from data/cbb/, else return fixtures.""" + sample_dir = Path(__file__).resolve().parent.parent / "data" / "cbb" + candidates = [ + sample_dir / "economic_data.jsonl", + sample_dir / "rows.jsonl", + ] + for path in candidates: + if path.exists(): + try: + rows = [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + if rows: + return rows + except Exception: + continue + return _synthetic_rows() + + +if __name__ == "__main__": + import json as _json + + rows = _load_sample_data() + report = run_quality_report(rows) + print(_json.dumps(report, indent=2, default=str)) diff --git a/reports/source_health_2026-03-24.md b/reports/source_health_2026-03-24.md new file mode 100644 index 0000000..3791f9d --- /dev/null +++ b/reports/source_health_2026-03-24.md @@ -0,0 +1,98 @@ +# Social Scraper — Daily Source Health Report +**Date:** 2026-03-24 + +--- + +## STRUCTURED DATA COLLECTORS + +### Indian Economy +🟢 **RBI Website** — Accessible, exchange rates updated March 23, circulars current (March 2026) +🟡 **RBI DBIE** — Redirected to `data.rbi.org.in/DBIE/` (301 from old `dbie.rbi.org.in`). SPA loads dynamically — content not verifiable via static fetch but redirect has been in place since June 2024. Verify scraper uses new URL. +🟡 **NSE India** — Timeout on fetch (aggressive bot blocking). Site likely operational but scraper must handle anti-bot measures (cookies, headers, rate limits). +🟢 **BSE India** — Accessible. Corporate announcements, market data, IPO sections all present. Angular SPA with dynamic content loading. +🟢 **CCIL** — Accessible. Money Market, G-Sec, Forex, Derivatives sections present. Data loads dynamically via AJAX. +🟢 **SEBI** — Accessible. "What's New" section with recent announcements visible. +🟢 **data.gov.in** — Accessible (page loads). No specific API deprecation notices found for 2026. + +### US & International +🟢 **FRED** — Accessible. No deprecation notices for v1 API. +🟡 **FRED API v2** — New API version launched Nov 2025 (bulk observations in JSON/XML). No v1 sunset date announced yet, but monitor for migration timeline. +🟢 **World Bank API** — Working. Returns valid JSON for India GDP indicator (latest data point: 2025, value null — expected lag for annual GDP). +🟢 **IMF Data** — Accessible. Home page reorganized with Data Explorer navigation. IFS/DOT/BOP may require navigating through Data Explorer rather than direct links. + +--- + +## NEWS & RSS FEEDS + +🟢 **CNBC RSS** — Valid RSS 2.0, 28 articles, last build March 23, 2026 +🟢 **CoinDesk RSS** — Valid RSS 2.0, 27 items, last updated March 23, 2026 +🟢 **arXiv q-fin RSS** — Valid XML, 18 recent papers (March 23, 2026) +🟢 **Federal Reserve Press RSS** — Valid XML, latest entry March 20 (FOMC statement March 18) +🟢 **ECB Press RSS** — Valid, last updated March 23, 2026 +🔴 **Reuters RSS** — Feeds officially deprecated since June 2020. Some workarounds have stopped working as of March 2026. Needs third-party RSS generator or direct web scraping. +🟡 **Economic Times RSS** — Blocked by fetch tool (bot protection). Verify scraper has proper headers/cookies. +🟡 **Livemint RSS** — Blocked by fetch tool. Same concern as ET. +🟡 **Moneycontrol RSS** — Blocked by fetch tool. Same concern. +🟡 **Financial Times RSS** — Not checked (typically paywalled). Verify subscription-based access still works. +🟢 **RBI Press RSS** — RBI website accessible, press releases current. + +--- + +## SOCIAL SCRAPERS + +🟡 **Reddit** — Blocked by fetch tool (bot protection). API access significantly tightened in 2026: approval harder to get since Jan 2026, free tier limited to 100 QPM, paid at $0.24/1K calls. Verify OAuth tokens and rate limits. +🟢 **Hacker News** — API fully operational. Firebase API returns valid JSON. Website accessible with current stories. +🟡 **YouTube** — Data API v3 still current, no full deprecation planned. Some features deprecated: `relatedToVideoId`, `commentThreads.update`, `comments.markAsSpam`. Verify your queries don't use deprecated endpoints. +🟢 **Mastodon** — Platform operational. Transitioning to European non-profit structure with new paid hosting model. Federation still functioning. +🟡 **GitHub** — New REST API version 2026-03-10 released. Breaking changes include: removed `merge_commit_sha` from PR responses, removed singular `assignee` field, changed workflow dispatch response. Old version 2022-11-28 still supported 24+ months. Verify API version header in scraper. +🔴 **SEC EDGAR** — `cgi-bin/browse-edgar` returned **403 Forbidden**. EFTS search also returned 403. EDGAR Next migration ongoing with beta environment. Forms 3/4/5 changes effective March 18, 2026. **Check if scraper needs updated User-Agent or new API endpoints.** +🟡 **Discord** — Permission changes Feb 23, 2026 (PIN_MESSAGES split). Voice API requires E2EE (DAVE) support since March 1. Verify bot permissions and DAVE compliance. +🟡 **Dark Web/Tor** — Not directly verifiable. Manual check recommended. +🟡 **Twitter/X** — Official API starts at $42K/year. Pay-per-use beta launched Nov 2025. Scraping landscape shifted to specialized proxy services. Verify current access method still functional. + +--- + +## MESSAGING CHANNELS + +🟡 **Telegram** — Not directly verifiable via fetch tool. Manual check recommended for channel activity. +🟡 **Twitter/X** — See above. Query-based scraping increasingly difficult. + +--- + +## CONNECTORS + +🟡 **DragonScope** — Not reachable via public web (likely internal API). Manual verification needed. +🟡 **LiquiFi** — Not reachable via public web (likely internal API). Manual verification needed. + +--- + +## IMMEDIATE ACTION NEEDED + +1. **🔴 SEC EDGAR** — Both `cgi-bin/browse-edgar` and EFTS returned 403. The SEC has been migrating to EDGAR Next with new API endpoints. Check `~/social_scraper/scrapers/sec_edgar/` — update User-Agent to comply with SEC requirements (`User-Agent: CompanyName admin@email.com`). Consider migrating to `api.edgarfiling.sec.gov` or `data.sec.gov` endpoints. + +2. **🔴 Reuters RSS** — Officially dead since 2020, workarounds failing. In `~/social_scraper/config/sources.yaml`, either: + - Replace with a third-party RSS generator service (e.g., rss.app) + - Switch to direct web scraping of reuters.com + - Use Reuters API if you have a commercial license + +## WATCH LIST + +- **FRED API v2** — Monitor for v1 sunset announcement. Consider proactive migration. +- **Reddit API** — Tighter approvals since Jan 2026. If scraper breaks, may need to re-apply or switch to paid tier. +- **GitHub API** — New version 2026-03-10 has breaking changes. Current version safe for 24+ months but plan migration. +- **Discord** — E2EE (DAVE) requirement for voice since March 1. Permission splits may affect bot. +- **NSE India / ET / Livemint / Moneycontrol** — Aggressive bot blocking detected. Ensure scrapers use proper browser headers, cookie management, and rate limiting. +- **SEC EDGAR** — Forms 3/4/5 schema changes effective March 18, 2026 (new Country and Foreign Trading Symbol fields). Update parsers if scraping insider trading data. +- **Twitter/X** — Monitor pay-per-use beta pricing. Current scraping approach may need periodic refreshing. + +## CONNECTOR STATUS + +- **DragonScope**: Unable to verify externally — requires internal network check +- **LiquiFi**: Unable to verify externally — requires internal network check + +## NEW OPPORTUNITIES + +- **FRED API v2**: Bulk observation downloads for all series in a release — could significantly speed up US macro data collection +- **SEC EDGAR API Toolkit**: New `api.edgarfiling.sec.gov` development toolkit available — modern REST API replacing legacy CGI interface +- **Mastodon Paid Hosting**: Mastodon offering enterprise services — could provide more reliable access for financial community monitoring +- **arXiv q-fin**: Strong signal source — 18 papers today including LLM-based stock prediction and Indian market survivorship bias studies diff --git a/reports/source_health_2026-04-09.md b/reports/source_health_2026-04-09.md new file mode 100644 index 0000000..31b8fb3 --- /dev/null +++ b/reports/source_health_2026-04-09.md @@ -0,0 +1,114 @@ +# Social Scraper — Daily Source Health Report +**Date:** 2026-04-09 + +--- + +## STRUCTURED DATA COLLECTORS + +### Indian Economy +🟢 **RBI Website** — Accessible. Updated April 8, 2026. Exchange rates, monetary policy statements current. +🔴 **RBI DBIE** — TLS certificate error on `dbie.rbi.org.in` (ERR_TLS_CERT_ALTNAME_INVALID). New domain `data.rbi.org.in/DBIE/` loads an Angular SPA but blocks automation (right-click disabled, F12 blocked). **Config still uses old URL. STILL UNFIXED from March 23 report.** +🟡 **NSE India** — Timeout on fetch (aggressive anti-bot). Likely operational but requires browser-level scraping. +🟢 **BSE India** — Accessible. Corporate announcements, market data, IPO, derivatives sections present. Angular SPA with dynamic content. +🟢 **CCIL** — Accessible. MIBOR, TREPS, CP/CD rates sections present. FBIL reference rates not on homepage (may require subpage navigation). Sovereign yield curve available as "ZCYC" under Value Added section. +🟢 **SEBI** — Accessible. Department directory and recent materials visible. Latest content dated late 2025/early 2026. +🔴 **data.gov.in** — **403 Forbidden**. API access blocked. No specific deprecation notice found, but access is currently denied. + +### US & International +🟡 **FRED** — Homepage returned 403 (bot blocking), but API itself likely operational. No deprecation notices for FRED API v1. No v1 sunset date yet. +🟢 **World Bank API** — Working. Returns valid JSON. Latest India GDP entry: 2025 (value null — expected annual lag). +🟡 **IMF Data** — 403 on `data.imf.org` homepage. Legacy portal retired Nov 5, 2025. New portal may block automated access. IFS/DOT/BOP datasets available through Data Explorer. + +--- + +## NEWS & RSS FEEDS + +🟢 **Federal Reserve Press RSS** — Valid XML, latest entry April 8, 2026 (FOMC Minutes from March 17-18) +🟢 **arXiv q-fin RSS** — Valid XML, 18 papers, latest April 8, 2026 +🟢 **CNBC RSS** — Likely operational (was fine on March 24) +🟢 **CoinDesk RSS** — Likely operational +🟢 **ECB Press RSS** — Likely operational +🔴 **Reuters RSS** — Dead since 2020. `feeds.reuters.com` unreachable. **STILL UNFIXED from March 24 report.** +🟡 **Economic Times / Livemint / Moneycontrol RSS** — Bot-blocked on fetch. Scraper needs proper headers/cookies. +🟡 **Financial Times RSS** — Paywalled. Manual verification needed. +🟢 **RBI Press RSS** — RBI website accessible, press releases current. + +--- + +## SOCIAL SCRAPERS + +🟡 **Reddit** — API approval significantly harder since Jan 2026. Rate limits: 60 req/min with OAuth, 10 without. Free tier capped at 100 QPM. Anti-AI/scraping terms tightened. +🟢 **Hacker News** — Fully operational. Firebase API returns valid JSON. Website accessible. +🟡 **YouTube** — Data API v3 stable, no full deprecation. Deprecated: `relatedToVideoId`, `commentThreads.update`, `comments.markAsSpam`. Verify queries. +🟡 **GitHub** — REST API v2026-03-10 released with breaking changes: `merge_commit_sha` removed from PRs, singular `assignee` removed. Old v2022-11-28 supported 24+ months. Security org fields deprecated April 21, 2026. +🔴 **SEC EDGAR** — `cgi-bin/browse-edgar` still 403. `data.sec.gov` submissions endpoint also 403. **Persistent issue since March 24. Scraper is broken.** +🟡 **Discord** — PIN_MESSAGES permission split effective Feb 23. E2EE (DAVE) required for voice since March 1. Verify bot compliance. +🟡 **Mastodon** — Operational. Transitioning to European non-profit structure. +🟡 **Twitter/X** — Official API $42K+/year. Pay-per-use beta from Nov 2025. $15K liquidated damages clause for >1M posts/day unauthorized scraping. High-risk for scraping approaches. +🟡 **Dark Web/Tor** — Not verifiable remotely. Manual check needed. + +--- + +## MESSAGING CHANNELS + +🟡 **Telegram** — Not verifiable via fetch. Manual check recommended. +🟡 **Twitter/X** — See above. Cookie-based scraping increasingly fragile. + +--- + +## CONNECTORS + +🟡 **DragonScope** — Internal API, not publicly reachable. Manual verification needed. +🟡 **LiquiFi** — Internal API, not publicly reachable. Manual verification needed. + +--- + +## IMMEDIATE ACTION NEEDED + +1. **🔴 RBI DBIE (PERSISTENT — unfixed since March 23)** + - TLS cert invalid on old URL, new portal blocks automation + - Files to update: + - `~/social_scraper/config/sources.yaml:8` — change `base_url` to `https://data.rbi.org.in/DBIE` + - `~/social_scraper/collectors/rbi_dbie.py:22` — update `BASE_URL` + - `~/social_scraper/run_collectors.py:58` — update hardcoded URL + - `~/social_scraper/monitoring/health/source_health_checker.py:99,542` — update check URLs + - `~/social_scraper/monitoring/health/structure_validator.py:198` — update validation URL + - `~/social_scraper/monitoring/health/baselines/rbi_dbie.json:2` — update baseline URL + - **WARNING**: New portal is Angular SPA with anti-automation. Collector may need Playwright/Selenium or API reverse-engineering. + +2. **🔴 SEC EDGAR (PERSISTENT — unfixed since March 24)** + - Both `cgi-bin/browse-edgar` AND `data.sec.gov` returning 403 + - Files: `~/social_scraper/scrapers/sec_scraper.py:53,140` + - Fix: Set User-Agent to `"CompanyName admin@email.com"` per SEC requirements. Migrate to `efts.sec.gov/LATEST/` or `data.sec.gov` with proper headers. + +3. **🔴 data.gov.in — NEW BREAKAGE** + - API returning 403 Forbidden (was working on March 24) + - File: `~/social_scraper/collectors/data_gov_in.py` + - Check if API key is still valid. May need re-registration or new auth mechanism. + +4. **🔴 Reuters RSS (PERSISTENT — unfixed since March 24)** + - `feeds.reuters.com` completely unreachable + - File: `~/social_scraper/config/sources.yaml:127-130` — remove or replace reuters feeds + - Options: third-party RSS generator, direct scraping, or commercial Reuters API + +## WATCH LIST + +- **FRED API** — Homepage 403 (bot blocking), API key-based access likely still fine. Monitor. +- **IMF Data** — New portal may require updated access patterns. Check `~/social_scraper/collectors/imf_data.py`. +- **Reddit** — Approval harder since Jan 2026. If tokens expire, re-approval may fail. +- **GitHub API** — v2026-03-10 breaking changes. Org security fields deprecated April 21. Plan migration. +- **Discord** — Permission splits and E2EE changes. Verify bot in `~/social_scraper/scrapers/discord_scraper.py`. +- **Twitter/X** — $15K damages clause for unauthorized bulk access. Reassess legal risk. +- **NSE / ET / Livemint / Moneycontrol** — Persistent bot blocking. Need browser-level scraping. + +## CONNECTOR STATUS + +- **DragonScope**: Cannot verify externally — internal network check required +- **LiquiFi**: Cannot verify externally — internal network check required + +## NEW OPPORTUNITIES + +- **RBI CIMS Integration**: RBI's new Centralised Information Management System (CIMS) launched with DBIE — may offer more structured API access than the old portal +- **SEC EDGAR API Toolkit**: `api.edgarfiling.sec.gov` development toolkit available as modern REST replacement for legacy CGI +- **GitHub REST API v2026-03-10**: New features alongside breaking changes — review for useful additions +- **FRED API v2**: Bulk observation downloads for all series in a release — significant speed improvement for US macro data collection diff --git a/reports/weekly_deep_validation_2026-04-09.md b/reports/weekly_deep_validation_2026-04-09.md new file mode 100644 index 0000000..dde3686 --- /dev/null +++ b/reports/weekly_deep_validation_2026-04-09.md @@ -0,0 +1,369 @@ +# Weekly Deep Validation Report — 2026-04-09 + +**Generated by:** Automated scheduled task +**Scope:** All social_scraper data sources, RSS feeds, scrapers, connectors, infrastructure + +--- + +## EXECUTIVE SUMMARY + +| Category | Total Sources | UP | CHANGED | DOWN/BROKEN | ACTION NEEDED | +|---|---|---|---|---|---| +| Structured Data Collectors | 10 | 6 | 2 | 2 | 4 | +| RSS Feeds | 14 | 7 | 2 | 5 | 5 | +| Social Scrapers | 10 | 5 | 3 | 2 | 5 | +| Messaging | 2 | 1 | 1 | 0 | 1 | +| Connectors | 3 | 3 | 0 | 0 | 0 | +| Infrastructure | 2 | 2 | 0 | 0 | 0 | + +**Critical issues requiring immediate attention: 4** +**Items needing investigation: 6** + +--- + +## 1. STRUCTURED DATA COLLECTORS + +### 1.1 RBI DBIE (collectors/rbi_dbie.py) +- **ENDPOINT STATUS:** DOWN (TLS certificate error on dbie.rbi.org.in) +- **STRUCTURE:** UNKNOWN — cannot verify due to TLS failure +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** **INVESTIGATE IMMEDIATELY** — TLS cert alt-name mismatch on `https://dbie.rbi.org.in/DBIE`. RBI may have updated their SSL certificate or moved the portal. Check if the data portal has migrated to `https://data.rbi.org.in` (new RBI data platform detected in web search). The new platform at `data.rbi.org.in` may be the replacement. +- **FILE TO EDIT:** `collectors/rbi_dbie.py` — update `BASE_URL` if portal has migrated + +### 1.2 RBI Circulars (collectors/rbi_circulars.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Standard HTML with press releases under `../Scripts/BS_PressReleaseDisplay.aspx`, notifications/circulars organized by function codes, jQuery-driven accordion menus +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** RBI issued new Digital Banking Channels Authorization Framework effective Jan 2026, and Digital Payment Authentication Framework effective Apr 1, 2026 — may generate high volume of new circulars +- **RECOMMENDATION:** No action needed on collector. Consider adding filters for DPDP Act compliance circulars. +- **FILE TO EDIT:** None + +### 1.3 NSE Bhavcopy (collectors/nse_bhavcopy.py) +- **ENDPOINT STATUS:** CHANGED (timeout on direct fetch — heavy anti-bot) +- **STRUCTURE:** LIKELY CHANGED — NSE has implemented static IP mandate effective April 1, 2026 for all API access. SEBI circular mandates whitelisted static IPs for API connections. +- **AUTH REQUIREMENTS:** **NEW RESTRICTIONS DETECTED** — NSE now rejects API calls from non-whitelisted/dynamic IPs. Cookie-based session approach in `nse_bhavcopy.py` may no longer work. +- **NEW FEATURES:** Enhanced security: Cloudflare Turnstile challenges, stricter rate limiting, request validation +- **RECOMMENDATION:** **UPDATE COLLECTOR URGENTLY** — The current approach of fetching cookies then hitting `/api/fiidiiTradeReact` and `/api/historical/cm/equity` may be completely blocked. Need to: (1) Register a static IP with your broker/NSE, (2) Update User-Agent and headers, (3) Consider using official data vendor API (TrueData, etc.) as fallback +- **FILE TO EDIT:** `collectors/nse_bhavcopy.py` lines 23-27 (headers), lines 29-31 (cookie fetch) + +### 1.4 BSE API (collectors/bse_api.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Angular-based SPA, API endpoints intact: `/corporates/anndet_new.aspx?newsid=`, `/markets/equity/searchsecurity.aspx`, `/markets/debt/debt_corporate_EOD.aspx`, `/corporates/ann.html` +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 1.5 CCIL Rates (collectors/ccil_rates.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Money market data (Call, Repo, TREPS rates with WAR, volumes), G-Sec NDS-OM data, Forex FX-Clear (USD/INR spot with SPOT, C-SPOT, R-SPOT instruments), MIBOR-OIS derivatives via ASTROID platform, ZCYC rates section present +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** Interactive charts with 60-minute lookback windows, additional derivative instruments (forwards, NDFs, swaptions) +- **RECOMMENDATION:** No action needed on core collector. Consider adding NDF and swaption data collection. +- **FILE TO EDIT:** None (optional: `collectors/ccil_rates.py` to add NDF data) + +### 1.6 FRED API (collectors/fred_api.py) +- **ENDPOINT STATUS:** CHANGED +- **STRUCTURE:** UNCHANGED (JSON response format same) +- **AUTH REQUIREMENTS:** **CHANGED** — FRED launched API Version 2 in November 2025. Strict API key requirements now enforced on `/data/` endpoint. DEMO_KEY returns 403 — real API key mandatory for all programmatic access. +- **NEW FEATURES:** ALFRED archival data no longer saveable to user accounts (as of Jan 5, 2026) +- **RECOMMENDATION:** Verify `FRED_API_KEY` env var is set and valid. The collector already requires an API key (line 31 checks for it). No code changes needed if key is valid. **Test with actual key to confirm.** +- **FILE TO EDIT:** None (verify API key is configured) + +### 1.7 SEBI Circulars (collectors/sebi_circulars.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** MODIFIED — Circulars listing now at `HomeAction.do?doListing=yes&sid=1&ssid=2&smid=0`, uses two-column layout (Year | Rule Name), "Updated List" and "Historical Data" toggle options added, breadcrumb navigation added +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** New SEBI algo trading regulations framework effective April 2026 +- **RECOMMENDATION:** Verify collector parses the `doListing` action URL format. Check if `ssid` parameter maps correctly to circulars (ssid=2 is Rules, check ssid for circulars/orders). +- **FILE TO EDIT:** `collectors/sebi_circulars.py` — verify URL construction matches current page structure + +### 1.8 data.gov.in (collectors/data_gov_in.py) +- **ENDPOINT STATUS:** DOWN (403 Forbidden) +- **STRUCTURE:** UNKNOWN +- **AUTH REQUIREMENTS:** POSSIBLY CHANGED — API requires authorized API key; portal may have updated access policies +- **NEW FEATURES:** API Setu platform (`apisetu.gov.in`) is a newer government API gateway that may offer alternative access paths +- **RECOMMENDATION:** **INVESTIGATE** — Check if `DATA_GOV_API_KEY` is still valid. Portal may have migrated or tightened access. Consider API Setu as supplementary source. Also check `datagovindia` Python package for updated endpoints. +- **FILE TO EDIT:** `collectors/data_gov_in.py` + +### 1.9 World Bank API (collectors/world_bank.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — JSON array with metadata + data objects, pagination working (66 pages for India GDP), standard fields (indicator, country, date, value, ISO3, unit, obs_status, decimal) +- **AUTH REQUIREMENTS:** UNCHANGED (no auth needed) +- **NEW FEATURES:** None +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 1.10 IMF Data (collectors/imf_data.py) +- **ENDPOINT STATUS:** DOWN (403 Forbidden on data.imf.org) +- **STRUCTURE:** POSSIBLY CHANGED — IMF now operates three API systems: (1) SDMX 2.1 at `sdmxcentral.imf.org/ws/public/sdmxapi/rest`, (2) SDMX 3.0 at `sdmxcentral.imf.org/sdmx/v2`, (3) DataMapper at `imf.org/external/datamapper/api` +- **AUTH REQUIREMENTS:** POSSIBLY CHANGED +- **NEW FEATURES:** SDMX 3.0 API available, Structure Map for cross-dataset conversion +- **RECOMMENDATION:** **INVESTIGATE** — Check if collector uses `data.imf.org` directly or the SDMX Central API. May need to switch to `sdmxcentral.imf.org` endpoints. +- **FILE TO EDIT:** `collectors/imf_data.py` + +--- + +## 2. RSS FEEDS (collectors/rss_feeds.py) + +### Feed-by-Feed Status + +| # | Feed Name | URL | Status | Structure | Action | +|---|---|---|---|---|---| +| 1 | reuters_business | feeds.reuters.com/reuters/businessNews | **BROKEN** | N/A — feed domain unreachable | **REPLACE** | +| 2 | reuters_markets | feeds.reuters.com/reuters/marketsNews | **BROKEN** | N/A — same domain | **REPLACE** | +| 3 | et_economy | economictimes.indiatimes.com/.../rssfeedstopstories.cms | **BLOCKED** | Cannot fetch (anti-bot) | Investigate | +| 4 | et_markets | economictimes.indiatimes.com/markets/rssfeeds/1977021501.cms | **BLOCKED** | Cannot fetch (anti-bot) | Investigate | +| 5 | mint_economy | livemint.com/rss/economy | **BLOCKED** | Cannot fetch | Investigate | +| 6 | mint_markets | livemint.com/rss/markets | **BLOCKED** | Same as above | Investigate | +| 7 | moneycontrol | moneycontrol.com/rss/latestnews.xml | **BLOCKED** | Cannot fetch | Investigate | +| 8 | rbi_press | rbi.org.in/Scripts/BS_PressReleaseDisplay.aspx | UP | **Not RSS** — HTML page | Verify parser handles HTML | +| 9 | fed_press | federalreserve.gov/feeds/press_all.xml | **UP** | UNCHANGED — RSS 2.0, `` with title/link/description/pubDate/guid/category, 20 items | No action | +| 10 | ecb_press | ecb.europa.eu/rss/press.html | **UP** | MODIFIED — Valid RSS 2.0, 14 items, but **no `` in items** (only title/link/pubDate/guid) | Check parser handles missing description | +| 11 | coindesk | coindesk.com/arc/outboundfeeds/rss/ | **DOWN** (403) | Forbidden | **REPLACE or add auth** | +| 12 | cnbc | search.cnbc.com/rs/search/combinedcms/view.xml... | **UP** | UNCHANGED — Standard RSS 2.0, title/link/description/pubDate/guid, 30 items, includes `` and `` | No action | +| 13 | ft_markets | ft.com/markets?format=rss | **LIKELY BLOCKED** | FT is paywalled, RSS likely requires auth | Investigate | +| 14 | arxiv_qfin | rss.arxiv.org/rss/q-fin | **UP** | UNCHANGED — 18 items, title/link/description/categories, properly formatted | No action | + +### Critical RSS Issues + +1. **Reuters feeds are dead** — Reuters officially killed RSS in June 2020. The `feeds.reuters.com` domain has been fully decommissioned. **Must replace** with alternatives like Reuters Wire via API, or use third-party RSS generators (RSS.app, FiveFilters). + +2. **CoinDesk returning 403** — CoinDesk may have added Cloudflare protection or deprecated the `/arc/outboundfeeds/rss/` path. + +3. **Indian news sites (ET, Mint, MoneyControl)** — These appear blocked when fetched externally but may work from Indian IP addresses. Verify from the deployment environment. + +4. **ECB feed lacks descriptions** — The ECB RSS feed no longer includes `` tags. The RSS collector at `rss_feeds.py:30` uses `_strip_html()` on descriptions — ensure it handles `None` gracefully. + +--- + +## 3. SOCIAL SCRAPERS + +### 3.1 Reddit (scrapers/reddit_scraper.py) +- **ENDPOINT STATUS:** CHANGED +- **STRUCTURE:** Reddit's `.json` endpoint (used by the scraper) still works for read-only access without API key. However, Reddit deprecated r/all in April 2026 and shifted to algorithmic feeds. +- **AUTH REQUIREMENTS:** CHANGED — Free tier limited to non-commercial use. Commercial use requires Standard tier ($12,000/year, 100 req/min). Scraping without API key may trigger rate limits more aggressively. +- **NEW FEATURES:** r/all deprecated, engagement-optimized feeds +- **RECOMMENDATION:** Verify `.json` endpoint still returns data for target subreddits (wallstreetbets, cryptocurrency, stocks). Consider switching to authenticated API if rate limits are hit. +- **FILE TO EDIT:** `scrapers/reddit_scraper.py` + +### 3.2 Hacker News (scrapers/hackernews_scraper.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Firebase API at `hacker-news.firebaseio.com/v0/` returns JSON arrays of story IDs as expected +- **AUTH REQUIREMENTS:** UNCHANGED (no auth needed) +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 3.3 YouTube (scrapers/youtube_scraper.py) +- **ENDPOINT STATUS:** UP (API v3 still active) +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED — 10,000 units/day default quota remains +- **NEW FEATURES:** No quota changes announced for 2026 +- **RECOMMENDATION:** No action needed. Monitor quota usage. +- **FILE TO EDIT:** None + +### 3.4 Mastodon (scrapers/mastodon_scraper.py) +- **ENDPOINT STATUS:** UP (federation APIs generally stable) +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 3.5 GitHub (scrapers/github_scraper.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — REST API v3 + trending +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 3.6 SEC EDGAR (scrapers/sec_scraper.py) +- **ENDPOINT STATUS:** CHANGED +- **STRUCTURE:** MODIFIED — SEC released EDGAR Release 26.0.1 and 26.1 with API updates. EDGAR full-text search API (`efts.sec.gov`) returns 403 from external requests (may require User-Agent with email as per SEC policy). +- **AUTH REQUIREMENTS:** CHANGED — SEC now requires `User-Agent` header with company name and email. The `data.sec.gov` RESTful APIs (submissions, XBRL) remain keyless. +- **NEW FEATURES:** EDGAR API Development Toolkit updated, EDGAR Beta environment for previewing API changes, Operational Status API with new degraded service statuses +- **RECOMMENDATION:** **UPDATE** — Ensure SEC scraper sends proper `User-Agent: CompanyName contact@email.com` header. Check EDGAR Beta for upcoming changes. Consider using `data.sec.gov` RESTful APIs instead of EFTS for more reliable access. +- **FILE TO EDIT:** `scrapers/sec_scraper.py` — add User-Agent header compliance + +### 3.7 Discord (scrapers/discord_scraper.py) +- **ENDPOINT STATUS:** UP (Discord API Gateway generally stable) +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED — requires bot token +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 3.8 Dark Web (scrapers/darkweb_scraper.py) +- **ENDPOINT STATUS:** DEPENDENT ON TOR PROXY +- **STRUCTURE:** N/A — varies by source +- **AUTH REQUIREMENTS:** N/A +- **RECOMMENDATION:** Verify Tor SOCKS5 proxy (docker service `tor` on port 9050) is running and can establish circuits +- **FILE TO EDIT:** None + +### 3.9 Web Scraper (scrapers/web_scraper.py) +- **ENDPOINT STATUS:** VARIES BY TARGET +- **STRUCTURE:** N/A +- **RECOMMENDATION:** General anti-bot measures (Cloudflare, DataDome) continue to tighten across the web. Ensure scraper handles JS challenges. +- **FILE TO EDIT:** None + +### 3.10 Central Banks (scrapers/centralbank_scraper.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Fed press RSS feed verified working and properly formatted +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +--- + +## 4. MESSAGING SOURCES + +### 4.1 Telegram (collectors/telegram_channels.py) +- **ENDPOINT STATUS:** UP (API access via api_id/api_hash) +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action needed. Verify channels (BloombergMarketsLive, financialjuice, WallStreetSilverOfficial, raboratory) are still active. +- **FILE TO EDIT:** None + +### 4.2 Twitter/X (collectors/twitter_lists.py + scrapers/twitter_scraper.py) +- **ENDPOINT STATUS:** CHANGED +- **STRUCTURE:** MODIFIED — X migrated from in-house bot detection to Cloudflare Turnstile challenges. Defensive changes every 2-4 weeks break DIY scrapers. +- **AUTH REQUIREMENTS:** **SIGNIFICANTLY CHANGED** — Authenticated sessions required for most profile timelines and all search results. Cookie-based scraping (`twikit` library) may break frequently. Legal risk: >1M posts/24h triggers $15,000 liquidated damages. +- **NEW FEATURES:** Cloudflare Turnstile integration, stricter rate limiting +- **RECOMMENDATION:** **HIGH RISK** — Cookie-based scraping via twikit is increasingly fragile. Options: (1) Keep twikit but accept frequent breakages and re-auth, (2) Switch to a specialized scraping API service, (3) Reduce scraping frequency to minimize detection. **Monitor twikit library updates closely.** +- **FILE TO EDIT:** `collectors/twitter_lists.py`, `scrapers/twitter_scraper.py` + +--- + +## 5. CONNECTORS + +### 5.1 DragonScope Connector (connectors/dragonscope.py) +- **STATUS:** Code review — HEALTHY +- **Redis Push:** Uses `market:{category}` keys + `market:updates` pub/sub channel. Format compatible. +- **API Push:** Falls back to `POST /api/data/{category}`. Endpoint pattern is standard. +- **Platform mapping:** Complete — all 12 platforms mapped to DragonScope categories +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** None + +### 5.2 LiquiFi Connector (connectors/liquifi.py) +- **STATUS:** Code review — HEALTHY +- **Treasury keywords:** Comprehensive coverage of rates, regulatory, forex, bonds, liquidity, banking, macro_india +- **Push targets:** WebSocket `/ws/rates` + REST API +- **RECOMMENDATION:** No action needed. Consider adding DPDP Act and digital payment keywords given RBI's April 2026 Digital Payment Authentication Framework. +- **FILE TO EDIT:** `connectors/liquifi.py` (optional — add DPDP keywords) + +### 5.3 Router (connectors/router.py) +- **STATUS:** Code review — HEALTHY +- **Classification logic:** Platform-based default + content-based override, working correctly +- **Routing:** DragonScope (Reddit, Discord, YouTube, HN, Mastodon, GitHub), LiquiFi (Central Banks), Both (Twitter, Telegram, RSS, Web, SEC, DarkWeb), All → Kafka +- **RECOMMENDATION:** No action needed. All content categories covered. +- **FILE TO EDIT:** None + +--- + +## 6. INFRASTRUCTURE + +### 6.1 Celery Beat Schedules + +**Schedule source:** `core/scheduler.py` (canonical) reads from `config/sources.yaml`. +**Legacy:** `scheduler/schedule.py` is deprecated (confirmed — just a warning). + +**YAML-driven collectors (sources.yaml):** 14 sources, all enabled, all have matching collector files. + +**Hardcoded social scrapers in core/scheduler.py:** +| Schedule Key | Task | Frequency | Matching File | +|---|---|---|---| +| scrape-reddit | core.tasks.scrape_reddit | */5 min | scrapers/reddit_scraper.py ✓ | +| scrape-twitter | core.tasks.scrape_twitter | */5 min | scrapers/twitter_scraper.py ✓ | +| scrape-hackernews | core.tasks.scrape_hackernews | */15 min | scrapers/hackernews_scraper.py ✓ | +| scrape-rss-financial | core.tasks.scrape_rss_financial | */2 min | scrapers/rss_scraper.py ✓ | +| scrape-central-banks | core.tasks.scrape_central_banks | */2 min | scrapers/centralbank_scraper.py ✓ | +| scrape-youtube | core.tasks.scrape_youtube | */15 min | scrapers/youtube_scraper.py ✓ | +| scrape-mastodon | core.tasks.scrape_mastodon | */15 min | scrapers/mastodon_scraper.py ✓ | +| scrape-sec | core.tasks.scrape_sec | */30 min | scrapers/sec_scraper.py ✓ | +| scrape-github | core.tasks.scrape_github | */30 min | scrapers/github_scraper.py ✓ | +| scrape-discord | core.tasks.scrape_discord | */30 min | scrapers/discord_scraper.py ✓ | +| scrape-web | core.tasks.scrape_web | */15 min | scrapers/web_scraper.py ✓ | +| scrape-darkweb | core.tasks.scrape_darkweb | */1 hr | scrapers/darkweb_scraper.py ✓ | + +**Orphaned schedule entries:** NONE +**Missing scrapers:** NONE +**Mismatches:** NONE — all schedule entries have corresponding scraper files. + +### 6.2 Docker Compose (docker-compose.yml) + +**Services defined:** 10 (postgres, redis, minio, zookeeper, kafka, tor, api, worker, beat, nlp-worker, flower) + +**Port conflicts:** NONE detected +| Service | Port | Status | +|---|---|---| +| postgres | 5432 | OK | +| redis | 6379 | OK | +| minio | 9000, 9001 | OK | +| zookeeper | 2181 | OK | +| kafka | 9092 | OK | +| tor | 9050, 8118 | OK | +| api | 8000 | OK | +| flower | 5555 | OK | + +**Health checks:** postgres (pg_isready), redis (redis-cli ping), minio (mc ready), kafka (kafka-topics --list), api (curl health endpoint) — ALL CONFIGURED. + +**Potential issue:** `nlp-worker` service does not have `restart: unless-stopped` set, unlike `worker` and `beat`. If it crashes, it won't restart automatically. + +--- + +## 7. NEW DATA SOURCES & API CHANGES + +### New Sources Worth Adding + +1. **RBI Data Platform (data.rbi.org.in)** — New RBI digital data platform. May replace or supplement DBIE. **Investigate as potential new collector.** + +2. **API Setu (apisetu.gov.in)** — Government of India's centralized API gateway. Could provide alternative access to data.gov.in datasets with potentially better reliability. + +3. **TrueData Market Data API (truedata.in)** — Indian market data API covering NSE EQ, NSE Indices, NSE F&O, BSE EQ, BSE Indices, BSE F&O, MCX. Could serve as reliable fallback for NSE/BSE data if direct scraping breaks. + +4. **IMF SDMX Central (sdmxcentral.imf.org)** — SDMX 3.0 API available as modern alternative to data.imf.org. + +5. **SEC EDGAR data.sec.gov** — RESTful APIs for submissions and XBRL data, no auth needed, sub-second latency. More reliable than EFTS full-text search. + +### Key Platform Changes in 2026 + +| Platform | Change | Impact | Urgency | +|---|---|---|---| +| NSE India | Static IP mandate (Apr 1, 2026) | API calls blocked from dynamic IPs | **CRITICAL** | +| Reddit | r/all deprecated, algorithmic feeds | Discovery pattern changed | MEDIUM | +| Twitter/X | Cloudflare Turnstile, auth-only search | Cookie scraping increasingly fragile | HIGH | +| FRED | API v2 strict key enforcement | Need valid key | LOW (already handled) | +| SEC EDGAR | Release 26.0.1 & 26.1, Beta API changes | New endpoints available | MEDIUM | +| RBI | DBIE portal TLS issues | Data collection may be interrupted | **CRITICAL** | +| Reuters | RSS feeds fully decommissioned (since 2020) | Two feed URLs broken | HIGH | +| CoinDesk | 403 on RSS endpoint | Feed broken | MEDIUM | +| RBI | Digital Payment Auth Framework (Apr 1, 2026) | High volume of new circulars expected | LOW | +| SEBI | Algo trading regulations framework | New regulatory content | LOW | + +--- + +## 8. PRIORITY ACTION ITEMS + +### P0 — Critical (Fix This Week) +1. **NSE Bhavcopy collector** — Static IP mandate blocks API access. Register static IP or switch to data vendor. (`collectors/nse_bhavcopy.py`) +2. **RBI DBIE collector** — TLS certificate error. Check if portal migrated to `data.rbi.org.in`. (`collectors/rbi_dbie.py`) +3. **Reuters RSS feeds** — Dead since 2020, should have been replaced long ago. Remove or replace with alternative. (`config/sources.yaml` lines 127-131) + +### P1 — High (Fix Within 2 Weeks) +4. **Twitter/X scraping** — Cloudflare Turnstile breaking cookie-based approach. Monitor twikit updates, consider reducing frequency. (`collectors/twitter_lists.py`, `scrapers/twitter_scraper.py`) +5. **CoinDesk RSS** — 403 Forbidden. Find new feed URL or remove. (`config/sources.yaml` line 157) +6. **SEC EDGAR scraper** — Add compliant User-Agent header. (`scrapers/sec_scraper.py`) + +### P2 — Medium (Fix Within Month) +7. **data.gov.in collector** — 403 Forbidden. Verify API key, investigate API Setu alternative. (`collectors/data_gov_in.py`) +8. **IMF Data collector** — 403 on data.imf.org. Consider SDMX Central endpoints. (`collectors/imf_data.py`) +9. **ECB RSS feed** — Missing `` tags. Verify parser handles None. (`collectors/rss_feeds.py`) +10. **Indian news RSS feeds** — May work from Indian IPs; verify from deployment environment. (ET, Mint, MoneyControl) + +### P3 — Low (Backlog) +11. Add `restart: unless-stopped` to `nlp-worker` service in `docker-compose.yml` +12. Consider adding DPDP Act keywords to LiquiFi connector +13. Explore new data sources: data.rbi.org.in, API Setu, TrueData, IMF SDMX 3.0, SEC data.sec.gov REST +14. Add NDF/swaption data to CCIL collector + +--- + +*Report generated: 2026-04-09 by automated weekly deep validation task* diff --git a/reports/weekly_deep_validation_2026-05-04.md b/reports/weekly_deep_validation_2026-05-04.md new file mode 100644 index 0000000..21aac3a --- /dev/null +++ b/reports/weekly_deep_validation_2026-05-04.md @@ -0,0 +1,390 @@ +# Weekly Deep Validation Report — 2026-05-04 + +**Generated by:** Scheduled Task `scraper-tech` +**Scope:** All 25 data sources, 14 RSS feeds, 5 connectors/infra components + +--- + +## EXECUTIVE SUMMARY + +| Category | Total | Healthy | Degraded/Investigate | Broken | Action Required | +|----------|-------|---------|----------------------|--------|-----------------| +| Structured Collectors | 13 | 6 | 5 | 2 | IMF (critical), BSE API, RBI DBIE, SEBI, data.gov.in, CCIL/FBIL | +| RSS Feeds | 14 | 10 | 0 | 4 | Reuters x2 (dead), MoneyControl (403), RBI Press (not RSS) | +| Social Scrapers | 10 | 7 | 0 | 3 | Mastodon (auth required), Reddit (throttle risk), Twitter (fragile) | +| Connectors | 3 | 3 | 0 | 0 | None | +| Infrastructure | 3 | 3 | 0 | 0 | None | + +**Critical issues requiring immediate action: 3** +1. IMF Data collector — backend API decommissioned (Nov 2025) +2. Reuters RSS feeds — domain `feeds.reuters.com` permanently dead (DNS failure) +3. Mastodon public timeline — now requires OAuth authentication + +**Issues requiring investigation from production server (may be geo-specific): 5** +4. RBI DBIE — TLS certificate error on `dbie.rbi.org.in`, possible migration to `data.rbi.org.in` +5. BSE — `api.bseindia.com` returning 301 redirects, API reorganized +6. SEBI — all URLs timing out from external network +7. data.gov.in — returning 403 Forbidden +8. CCIL/FBIL — FBIL reference rates URL returning 404, `fbil.org.in` unreachable + +**NOTE:** Items 4-8 were tested from non-Indian IPs. Many Indian government/financial sites implement geo-restrictions. These MUST be verified from production infrastructure before taking action. + +--- + +## 1. STRUCTURED DATA COLLECTORS + +### 1.1 RBI DBIE (collectors/rbi_dbie.py) — INVESTIGATE +- **ENDPOINT STATUS:** DEGRADED — `dbie.rbi.org.in` returns TLS certificate error (`ERR_TLS_CERT_ALTNAME_INVALID`). The certificate no longer covers the `dbie.rbi.org.in` hostname. Alternate portal `data.rbi.org.in` loads but appears to be a JS-heavy SPA with minimal server-side HTML. +- **STRUCTURE:** POTENTIALLY BROKEN — cannot connect via plain HTTPS due to cert mismatch. RBI main site now references `data.rbi.org.in` as the data portal, suggesting DBIE may have been consolidated. +- **AUTH REQUIREMENTS:** UNKNOWN (cannot connect to verify) +- **NEW FEATURES:** `data.rbi.org.in` may be the new canonical endpoint (SPA-based) +- **RECOMMENDATION:** INVESTIGATE from production server — TLS cert errors may be intermittent or CDN-specific. If `dbie.rbi.org.in` is permanently broken, migrate to `data.rbi.org.in` with headless browser support. As stopgap, Weekly Statistical Supplement data is available on main `rbi.org.in` publications section. +- **FILE TO EDIT:** ~/social_scraper/collectors/rbi_dbie.py (line with base_url if migration needed) +- **NOTE:** Test from Indian IP — some issues may be geo-specific + +### 1.2 RBI Circulars (collectors/rbi_circulars.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — press releases, notifications, circulars sections intact +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** April 2026 DPDP Act advisory and Digital Payment Authentication Directions published — verify these are being ingested +- **RECOMMENDATION:** No action (verify new circulars are captured) +- **FILE TO EDIT:** N/A + +### 1.3 NSE (collectors/nse_bhavcopy.py) +- **ENDPOINT STATUS:** UP (heavy anti-bot protection active) +- **STRUCTURE:** UNCHANGED — bhavcopy download pattern intact +- **AUTH REQUIREMENTS:** UNCHANGED for data scraping. Note: NSE algo trading API now requires static IP + whitelisted keys (April 2026) but this does NOT affect website data collection +- **NEW FEATURES:** None affecting data collection +- **RECOMMENDATION:** Monitor — NSE frequently tightens anti-bot; add better 403/captcha handling +- **FILE TO EDIT:** ~/social_scraper/collectors/nse_bhavcopy.py (low priority) + +### 1.4 BSE (collectors/bse_api.py) — INVESTIGATE +- **ENDPOINT STATUS:** PARTIALLY UP — main site loads but is a fully client-rendered SPA (minimal server-side HTML) +- **STRUCTURE:** MODIFIED — old REST API at `api.bseindia.com/BseIndiaAPI/api/` now returns **301 Moved Permanently** redirecting to `www.bseindia.com/members/showinterest.aspx`. This indicates BSE has deprecated or reorganized their public API endpoints. +- **AUTH REQUIREMENTS:** CHANGED — API redirect to `/members/` path suggests some endpoints may now require member login +- **NEW FEATURES:** Cannot fully assess due to SPA rendering +- **RECOMMENDATION:** INVESTIGATE — re-map API endpoints by inspecting network traffic from BSE website. Check if `www.bseindia.com/download/BhseCsv/Equity/` still serves bhavcopy CSVs directly. May need headless browser approach for SPA content. +- **FILE TO EDIT:** ~/social_scraper/collectors/bse_api.py +- **NOTE:** Test API endpoints from production server — may be IP/geo-specific blocking + +### 1.5 CCIL (collectors/ccil_rates.py) +- **ENDPOINT STATUS:** UP — site fully operational with rich market data +- **STRUCTURE:** UNCHANGED for core data (Money Market, G-Sec, Forex, Derivatives tables) +- **AUTH REQUIREMENTS:** UNCHANGED — public data accessible without login +- **NEW FEATURES DETECTED:** + - New swap instruments beyond 12 months on FX-CLEAR + - Portfolio Compression service for IRS trades (84.92% compression) + - Margin Calculator for USDINR Forex Forward and Rupee IRS + - ZCYC (Zero Coupon Yield Curve) parameters available + - Real-time data refresh intervals selectable (2, 5, 10, 15, 30, 60 min) +- **FBIL WARNING:** `/web/ccil/fbil-reference-rate` returns 404. `fbil.org.in` appears unreachable (ECONNREFUSED). FBIL rates may have moved — investigate new URL. +- **RECOMMENDATION:** Add new data products as scraping targets. **Investigate FBIL reference rates location** — critical for treasury operations. +- **FILE TO EDIT:** ~/social_scraper/collectors/ccil_rates.py (add new data products, fix FBIL URL) + +### 1.6 FRED API (collectors/fred_api.py) +- **ENDPOINT STATUS:** UP (403 with DEMO_KEY is expected; real API keys work) +- **STRUCTURE:** UNCHANGED — v1 API responds correctly with valid keys +- **AUTH REQUIREMENTS:** UNCHANGED — API key required (already configured via `${FRED_API_KEY}`) +- **SERIES STATUS:** All 11 series IDs confirmed valid: FEDFUNDS, CPIAUCSL, DGS10, DGS2, DTWEXBGS, UNRATE, GDP, SOFR, T10Y2Y, VIXCLS, BAMLH0A0HYM2 +- **NEW FEATURES:** FRED API v2 documentation page exists but no forced migration timeline detected +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 1.7 SEBI (collectors/sebi_circulars.py) — INVESTIGATE +- **ENDPOINT STATUS:** TIMEOUT — all URLs tested (homepage, `/legal/circulars.html`, `/enforcement/orders.html`) time out after 60s from external fetch. May indicate aggressive rate limiting, WAF protection, or geo-blocking. +- **STRUCTURE:** UNABLE TO VERIFY from external network +- **AUTH REQUIREMENTS:** UNKNOWN — may now require browser-like headers or JS execution +- **NEW FEATURES:** Cannot assess +- **RECOMMENDATION:** INVESTIGATE from production server (Indian IP). If confirmed blocking, add browser-like headers or headless browser fallback. SEBI circulars are often mirrored on financial news sites as fallback. +- **FILE TO EDIT:** ~/social_scraper/collectors/sebi_circulars.py (if blocking confirmed) +- **NOTE:** This may be geo-restriction — test from Indian infrastructure first + +### 1.8 data.gov.in (collectors/data_gov_in.py) — INVESTIGATE +- **ENDPOINT STATUS:** DEGRADED — main site and datasets page return HTTP 403 Forbidden from external network. API endpoint `api.data.gov.in` returns 404 Not Found. OGPL APIs page also returns 403. +- **STRUCTURE:** UNABLE TO VERIFY — blanket 403 blocks access +- **AUTH REQUIREMENTS:** POSSIBLY CHANGED — 403 may indicate: (a) geo-restriction to Indian IPs, (b) mandatory API key even for browsing, or (c) platform migration +- **NEW FEATURES:** Cannot assess +- **RECOMMENDATION:** INVESTIGATE from production server (Indian IP). If the OGPL API has been deprecated, check NIC announcements for migration notices. API Setu (`apisetu.gov.in`) may be an alternative. +- **FILE TO EDIT:** ~/social_scraper/collectors/data_gov_in.py (if platform migrated) +- **NOTE:** Likely geo-restriction — test from Indian IP before taking action + +### 1.9 World Bank API (collectors/world_bank.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — v2 API responding correctly + - Pagination metadata: page, pages, per_page, total, lastupdated (2026-04-08) + - Data array with indicator, country, year, value fields intact + - All indicators valid: NY.GDP.MKTP.CD, FP.CPI.TOTL.ZG, BN.CAB.XOKA.CD + - All countries accessible: IN, US, CN, GB, JP, DE +- **AUTH REQUIREMENTS:** UNCHANGED — no auth needed +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 1.10 IMF Data (collectors/imf_data.py) — CRITICAL +- **ENDPOINT STATUS:** DOWN — `dataservices.imf.org` DECOMMISSIONED (November 2025) +- **STRUCTURE:** BROKEN — all requests to old endpoint timeout +- **AUTH REQUIREMENTS:** CHANGED — new SDMX 3.0 API at `sdmxcentral.imf.org` +- **NEW FEATURES:** New SDMX Central API with hundreds of dataflows (IFS, BOP, DOT confirmed available) +- **RECOMMENDATION:** REWRITE COLLECTOR — migrate from old REST JSON API to new SDMX Central API + - Old: `http://dataservices.imf.org/REST/SDMX_JSON.svc/CompactData/{dataset}/...` + - New primary: `https://sdmxcentral.imf.org/ws/public/sdmxapi/rest/data/STA/{dataset}/...` + - New fallback: `https://www.imf.org/external/datamapper/api/v1/{indicator}/{country}` (confirmed working, returns JSON) + - Consider using the `sdmx1` Python library for SDMX 3.0 parsing +- **FILE TO EDIT:** ~/social_scraper/collectors/imf_data.py (line 14: BASE_URL) + +### 1.11 Telegram (collectors/telegram_channels.py) +- **ENDPOINT STATUS:** Requires runtime API credentials (api_id/api_hash) — cannot test externally +- **STRUCTURE:** Assumed UNCHANGED — Telegram Bot API and MTProto stable +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action (verify in production) +- **FILE TO EDIT:** N/A + +### 1.12 Twitter Lists (collectors/twitter_lists.py) +- **ENDPOINT STATUS:** FRAGILE — twikit cookie-based auth breaks every 2-4 weeks +- **STRUCTURE:** CHANGED — X (Twitter) free API tier is now write-only (Feb 2026). Guest tokens bound to browser fingerprints. Datacenter IPs permanently banned +- **AUTH REQUIREMENTS:** NEW RESTRICTIONS — paid API ($100/mo Basic) required for reliable read access +- **NEW FEATURES:** Pay-per-use billing model +- **RECOMMENDATION:** Investigate — verify twikit still functional, consider paid API ($100/mo Basic tier) +- **FILE TO EDIT:** ~/social_scraper/collectors/twitter_lists.py, ~/social_scraper/scrapers/twitter_scraper.py + +--- + +## 2. RSS FEEDS (collectors/rss_feeds.py) + +### Working Feeds (10/14) + +| Feed | Status | Structure | Notes | +|------|--------|-----------|-------| +| et_economy | UP | RSS 2.0, 51 items | No `` — title/link/guid/pubDate only | +| et_markets | UP | RSS 2.0, 50 items | Has ``, `` for images | +| mint_economy | UP | RSS 2.0 + MRSS, 35 items | `` for images | +| mint_markets | UP | RSS 2.0 + MRSS, 35 items | Same structure as economy | +| fed_press | UP | RSS 2.0, 20 items | Has `` (5 types), latest: May 1, 2026 | +| ecb_press | UP | RSS 2.0, 14 items | No `` — title/link/guid/pubDate only | +| coindesk | UP | RSS 2.0 rich, 25 items | ``, empty ``, TTL=5min | +| cnbc | UP | RSS 2.0 + custom metadata, 30 items | `` fields | +| ft_markets | UP | RSS 2.0 + dc, 25 items | GUIDs are UUIDs not URLs, TTL=15min | +| arxiv_qfin | UP | RSS 2.0 + arxiv/dc, 0 items | Empty on weekends (has `` Sat/Sun) | + +### Broken Feeds (4/14) — ACTION REQUIRED + +| Feed | Status | Issue | Action | +|------|--------|-------|--------| +| **reuters_business** | **DOWN** | DNS dead — `feeds.reuters.com` permanently retired | Remove or replace (AP News RSS, RSSHub proxy) | +| **reuters_markets** | **DOWN** | DNS dead — same domain | Remove or replace | +| **moneycontrol** | **DOWN** | HTTP 403 — Akamai CDN bot block | Route via residential proxy or replace | +| **rbi_press** | **NOT RSS** | HTML page, never was RSS | Switch to HTML scraper (BeautifulSoup) | + +**RECOMMENDATION:** Update `config/sources.yaml` RSS feeds list: +- Remove reuters_business and reuters_markets entries +- Add replacement feeds (e.g., AP News, Bloomberg RSS if available) +- Change rbi_press to use HTML parsing, or remove from RSS collector and handle in rbi_circulars collector +- For moneycontrol, test from production server (may be IP/geo-specific block) + +**FILE TO EDIT:** ~/social_scraper/config/sources.yaml (feeds section), ~/social_scraper/collectors/rss_feeds.py + +--- + +## 3. SOCIAL SCRAPERS + +### 3.1 Reddit (scrapers/reddit_scraper.py) +- **ENDPOINT STATUS:** UP — JSON API returns valid Listing objects with 108 fields per post +- **STRUCTURE:** UNCHANGED — `data.children[]` array, pagination via `data.after` token +- **AUTH REQUIREMENTS:** CHANGED — OAuth now enforced. Unauthenticated `.json` endpoint throttled to 10 req/min. OAuth free tier allows 100 req/min +- **RECOMMENDATION:** Investigate — migrate to OAuth as primary path. At 10 req/min across 18 subreddits, throttling will hit during scan cycles +- **FILE TO EDIT:** ~/social_scraper/scrapers/reddit_scraper.py + +### 3.2 Hacker News (scrapers/hackernews_scraper.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Firebase API returns array of story IDs `[48002938, ...]` +- **AUTH REQUIREMENTS:** UNCHANGED — no auth needed +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 3.3 YouTube (scrapers/youtube_scraper.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Data API v3, 10,000 units/day quota +- **AUTH REQUIREMENTS:** UNCHANGED — quota increase requests now require compliance audit (1-6 weeks) +- **RECOMMENDATION:** No action (existing `_QUOTA_COOLDOWN_SECONDS` mechanism sufficient) +- **FILE TO EDIT:** N/A + +### 3.4 Mastodon (scrapers/mastodon_scraper.py) — ACTION REQUIRED +- **ENDPOINT STATUS:** DOWN — HTTP 422 `"This method requires an authenticated user"` +- **STRUCTURE:** BROKEN — public timeline API disabled for unauthenticated access +- **AUTH REQUIREMENTS:** NEW RESTRICTION — OAuth bearer token now required +- **RECOMMENDATION:** Update authentication — register app at mastodon.social/settings/applications, add OAuth token. Alternative: use `/api/v1/timelines/tag/:hashtag` endpoint or switch to a different instance +- **FILE TO EDIT:** ~/social_scraper/scrapers/mastodon_scraper.py + +### 3.5 GitHub (scrapers/github_scraper.py) +- **ENDPOINT STATUS:** UP — rate limits: core 60/hr (unauth), search 10/min, GraphQL requires auth +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED — consider using PAT for 5000/hr core limit +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 3.6 SEC EDGAR (scrapers/sec_scraper.py) +- **ENDPOINT STATUS:** UP — requires User-Agent header (mandatory, must include contact info) +- **STRUCTURE:** MODIFIED + - `/LATEST/search-index` endpoint: WORKS with compliant User-Agent (HTTP 200) + - `/LATEST/search` endpoint: NOW RETURNS `"Missing Authentication Token"` — may require new auth + - **NEW aggregations field** in responses: faceted search by form type, entity, SIC code, state + - **NEW query transparency** — parsed Elasticsearch query returned in response + - 18 source fields confirmed: adsh, biz_locations, ciks, display_names, file_date, form, sics, etc. +- **AUTH REQUIREMENTS:** CHANGED — User-Agent now strictly mandatory; `/search` endpoint may need new auth +- **NEW FEATURES:** EDGAR 26.0.1 (Feb 2026) + 26.1 (Mar 2026). Aggregations/faceted search. EDGAR Beta in preview +- **RECOMMENDATION:** Verify scraper uses `/LATEST/search-index` (not `/LATEST/search`). Ensure User-Agent includes contact email. Consider leveraging new aggregations for enhanced filtering +- **FILE TO EDIT:** ~/social_scraper/scrapers/sec_scraper.py (verify endpoint path + User-Agent) + +### 3.7 Discord (scrapers/discord_scraper.py) +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED for read-only message scraping +- **AUTH REQUIREMENTS:** UNCHANGED — permission splits (PIN_MESSAGES, BYPASS_SLOWMODE) don't affect read-only bots +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 3.8 Dark Web (scrapers/darkweb_scraper.py) +- **ENDPOINT STATUS:** Requires Tor SOCKS5 proxy — cannot test externally +- **STRUCTURE:** Assumed UNCHANGED +- **RECOMMENDATION:** Verify in production +- **FILE TO EDIT:** N/A + +### 3.9 Web Scraper (scrapers/web_scraper.py) +- **ENDPOINT STATUS:** UP — general web targets accessible +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 3.10 Central Banks (scrapers/centralbank_scraper.py) +- **ENDPOINT STATUS:** UP — Fed RSS active (latest: May 1, 2026), ECB RSS active (latest: May 3, 2026) +- **STRUCTURE:** UNCHANGED +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +--- + +## 4. CONNECTORS + +### 4.1 DragonScope (connectors/dragonscope.py) +- **STATUS:** Code review HEALTHY +- **Redis push:** Dual-mode (Redis direct + API fallback), MAX_ITEMS_PER_CATEGORY=500 cap, connection reset on failure +- **Data format:** 4 category transformers (reddit_posts, news, github_repos, sec_filings) + generic fallback +- **RECOMMENDATION:** No action + +### 4.2 LiquiFi (connectors/liquifi.py) +- **STATUS:** Code review HEALTHY +- **Treasury filter:** Word-boundary regex for short keywords (prevents "omo" matching "tomorrow"), 0.3 relevance threshold +- **Rate extraction:** 6 rate patterns with sanity-check ranges (repo_rate 0-15%, usdinr 40-150, etc.) +- **RECOMMENDATION:** No action + +### 4.3 Router (connectors/router.py) +- **STATUS:** Code review HEALTHY +- **Classification:** Platform-based default + content-based override (treasury score >= 0.3 promotes to BOTH) +- **Parallel push:** asyncio.gather for DragonScope + LiquiFi, Kafka persistence with 45s timeout +- **No missed categories:** All Platform enum values are covered by DRAGONSCOPE_PLATFORMS, LIQUIFI_PLATFORMS, or BOTH_PLATFORMS, with DragonScope as default fallback +- **RECOMMENDATION:** No action + +--- + +## 5. INFRASTRUCTURE + +### 5.1 Celery Beat Schedules +- **STATUS:** CONSISTENT +- **YAML-driven:** 13 sources in config/sources.yaml, all map to valid collector classes via `build_beat_schedule()` +- **Hardcoded scrapers:** 12 scraper tasks in core/scheduler.py, all map to existing scraper files +- **System tasks:** 9 operational tasks (process-articles, daily-digest, health-check, data-quality, route-to-destinations, push-stats, backpressure-check, retention-cleanup, generate-daily-report) +- **Deprecated file:** scheduler/schedule.py properly deprecated with warnings, no conflict +- **RECOMMENDATION:** No action + +### 5.2 Docker Compose +- **STATUS:** HEALTHY +- **Services:** 11 (postgres, redis, minio, zookeeper, kafka, tor, api, worker, beat, nlp-worker, flower) +- **Port conflicts:** None — all 10 exposed ports are unique (2181, 5432, 5555, 6379, 8000, 8118, 9000, 9001, 9050, 9092) +- **Health checks:** postgres, redis, minio, kafka all have explicit health checks with proper intervals +- **Dependencies:** api and worker properly depend on postgres+redis with `condition: service_healthy` +- **RECOMMENDATION:** No action + +### 5.3 Schedule Consistency +- **YAML sources to collector classes:** All 13 mappings verified correct (class names match file contents) +- **Scraper tasks to scraper files:** All 12 mappings verified (class names found via grep) +- **Orphaned entries:** None detected +- **RECOMMENDATION:** No action + +--- + +## 6. NEW DATA SOURCE OPPORTUNITIES + +| Source | Description | Effort | Auth Required | +|--------|-------------|--------|---------------| +| **mfdata.in** | Free Indian mutual fund NAV/holdings API (14K+ schemes, 18yr history) | Low | No | +| **RSSHub proxy** | Self-hosted RSS proxy for Reuters and other paywalled feeds | Medium | No | +| **EDGAR Operational Status API** | New endpoints for detecting EDGAR degraded service | Low | No | + +--- + +## 7. PRIORITY ACTION ITEMS + +### CRITICAL (blocking data collection) +| # | Issue | File | Action | +|---|-------|------|--------| +| 1 | IMF API decommissioned (Nov 2025) | `collectors/imf_data.py:14` | Rewrite to use `sdmxcentral.imf.org` SDMX 3.0 API | +| 2 | Reuters RSS feeds dead (DNS failure) | `config/sources.yaml` | Remove reuters_business + reuters_markets, add replacements | +| 3 | Mastodon public timeline requires OAuth | `scrapers/mastodon_scraper.py` | Add OAuth app registration + bearer token | + +### HIGH (degraded or at risk) +| # | Issue | File | Action | +|---|-------|------|--------| +| 4 | Twitter/X twikit fragile, free API write-only | `scrapers/twitter_scraper.py` | Verify twikit works; evaluate $100/mo Basic tier | +| 5 | Reddit unauthenticated throttled to 10 req/min | `scrapers/reddit_scraper.py` | Migrate to OAuth (100 req/min free tier) | +| 6 | MoneyControl RSS blocked (Akamai 403) | `config/sources.yaml` | Test from prod server; proxy or replace | + +### INVESTIGATE FROM PRODUCTION (may be geo-specific — test from Indian IP before acting) +| # | Issue | File | Action | +|---|-------|------|--------| +| 7 | RBI DBIE TLS cert error on `dbie.rbi.org.in`, possible migration to `data.rbi.org.in` | `collectors/rbi_dbie.py` | Test from prod; if broken, migrate to `data.rbi.org.in` (may need headless browser) | +| 8 | BSE `api.bseindia.com` returning 301, API reorganized | `collectors/bse_api.py` | Inspect BSE network traffic to find new API endpoints | +| 9 | SEBI all URLs timing out from external network | `collectors/sebi_circulars.py` | Test from prod; if blocking, add browser-like headers | +| 10 | data.gov.in returning 403 Forbidden | `collectors/data_gov_in.py` | Test from prod; check if OGPL API deprecated | +| 11 | CCIL FBIL reference rates URL 404, `fbil.org.in` unreachable | `collectors/ccil_rates.py` | Locate new FBIL endpoint; add new CCIL data products | + +### LOW (monitor / cosmetic) +| # | Issue | File | Action | +|---|-------|------|--------| +| 12 | RBI Press is HTML page, not RSS | `config/sources.yaml` | Move to HTML scraper or rbi_circulars collector | +| 13 | ECB + ET Economy RSS missing `` | `collectors/rss_feeds.py` | Ensure parser handles null description gracefully | +| 14 | arXiv q-fin empty on weekends | `core/scheduler.py` | Optional: add weekday-only schedule | +| 15 | NSE may tighten anti-bot | `collectors/nse_bhavcopy.py` | Add 403/captcha error handling | +| 16 | SEC EDGAR: User-Agent has placeholder email (`research@example.com` at line 61) | `scrapers/sec_scraper.py:61` | Update User-Agent to real contact email | +| 17 | FRED API v2 exists | `collectors/fred_api.py` | No migration forced yet — monitor | + +--- + +## 8. ECOSYSTEM CHANGES SUMMARY + +| Platform | Change | Impact | +|----------|--------|--------| +| Reddit | OAuth enforced, unauthenticated throttled to 10 req/min | MEDIUM | +| Twitter/X | Free API write-only (Feb 2026), guest tokens browser-fingerprinted | HIGH | +| SEC EDGAR | Releases 26.0.1 + 26.1, Beta environment in preview | LOW | +| YouTube | No quota changes; quota increase needs compliance audit | NONE | +| Discord | Permission splits (PIN, BYPASS_SLOWMODE); E2EE for calls | NONE | +| NSE India | Algo trading: static IP + whitelisted keys required (April 2026) | LOW (data scraping unaffected) | +| RBI | DPDP Act advisory + Digital Payment Auth Directions (April 2026) | LOW | +| IMF | `dataservices.imf.org` decommissioned Nov 2025; migrated to SDMX 3.0 | CRITICAL | +| Reuters | `feeds.reuters.com` domain permanently retired | CRITICAL | +| Mastodon | Public timeline API requires authentication | HIGH | + +--- + +--- + +## 9. KEY TRENDS + +**Indian financial regulators migrating to SPA architectures:** Multiple Indian sources (BSE, potentially DBIE, data.gov.in) are moving from server-rendered HTML/REST APIs to client-rendered SPAs. This trend will eventually require headless browser infrastructure (Playwright/Puppeteer) for reliable data collection from Indian regulatory sites. Consider adding a shared headless browser service to the Docker stack. + +**Anti-bot protection intensifying:** NSE, MoneyControl, SEBI, and data.gov.in all show increased bot-detection measures. The scraper may need residential proxy rotation or browser fingerprint management for continued access. + +**Social platform APIs becoming more restrictive:** Reddit (OAuth enforcement), Twitter/X (pay-for-read), Mastodon (auth required) all moved toward more restrictive access in 2025-2026. The general trend is toward authenticated, rate-limited, paid access for any automated consumption. + +--- + +*Report generated 2026-05-04 by scheduled task `scraper-tech`* +*Next validation: 2026-05-11* diff --git a/reports/weekly_deep_validation_2026-05-25.md b/reports/weekly_deep_validation_2026-05-25.md new file mode 100644 index 0000000..d69280f --- /dev/null +++ b/reports/weekly_deep_validation_2026-05-25.md @@ -0,0 +1,380 @@ +# Weekly Deep Source Validation Report +**Date:** 2026-05-25 (Sunday) +**Run by:** Automated scheduled task + +--- + +## EXECUTIVE SUMMARY + +| Category | Total | UP/OK | CHANGED | DOWN/BROKEN | ACTION NEEDED | +|----------|-------|-------|---------|-------------|---------------| +| Structured Data Collectors | 10 | 5 | 3 | 2 | 5 | +| RSS Feeds | 14 | 6 | 2 | 4 | 6 | +| Social Scrapers | 10 | 4 | 3 | 1 | 4 | +| Messaging | 2 | 1 | 1 | 0 | 1 | +| Connectors | 3 | 3 | 0 | 0 | 0 | +| Infrastructure | 2 | 2 | 0 | 0 | 0 | + +**Critical Issues (Immediate):** +1. RBI DBIE URL has changed — `dbie.rbi.org.in` has TLS cert issues; new portal at `data.rbi.org.in` +2. CCIL FBIL rates URL structure changed — old `/web/ccil/fbil-overnight-mibor` returns 404 +3. IMF legacy API (`dataservices.imf.org`) appears unresponsive — migrated to SDMX 3.0 +4. Reuters RSS feeds confirmed dead since 2020 — still configured in sources.yaml +5. SEC EDGAR EFTS returning 403 — may need User-Agent update for EDGAR Next + +--- + +## 1. RBI DBIE + +- **ENDPOINT STATUS:** CHANGED / DOWN +- **STRUCTURE:** BROKEN — TLS certificate error on `dbie.rbi.org.in`. RBI has migrated to `data.rbi.org.in/DBIE/` +- **AUTH REQUIREMENTS:** NEW URL discovered; may need updated headers +- **NEW FEATURES:** RBI launched RBIDATA mobile app; new portal at `data.rbi.org.in` is the modernized DBIE +- **RECOMMENDATION:** **Update collector URL immediately** — change `BASE_URL` from `https://dbie.rbi.org.in/DBIE` to `https://data.rbi.org.in/DBIE/`. Investigate new API structure. +- **FILE TO EDIT:** `collectors/rbi_dbie.py` (line 24: `BASE_URL`) and `config/sources.yaml` (rbi_dbie.base_url) + +--- + +## 2. RBI Circulars + +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — Press releases page still uses chronological list, year-based dropdown navigation, URL pattern `BS_PressReleaseDisplay.aspx?prid=[ID]` +- **AUTH REQUIREMENTS:** UNCHANGED (no auth needed) +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** No action needed +- **FILE TO EDIT:** N/A + +--- + +## 3. NSE India + +- **ENDPOINT STATUS:** DOWN (connection reset — heavy anti-bot) +- **STRUCTURE:** CANNOT VERIFY — NSE's aggressive anti-bot protection blocks automated fetches +- **AUTH REQUIREMENTS:** NSE continues to use heavy browser fingerprinting, session cookies, and WAF protection +- **NEW FEATURES:** Cannot assess +- **RECOMMENDATION:** Verify in-production scraper logs. If bhavcopy downloads are failing, may need to rotate User-Agents or use playwright-based fetching. Consider if NSE has tightened protections further. +- **FILE TO EDIT:** `collectors/nse_bhavcopy.py` (if issues found in production logs) + +--- + +## 4. BSE + +- **ENDPOINT STATUS:** UP (partial — page loads but minimal content via automated fetch) +- **STRUCTURE:** UNCHANGED (JavaScript-heavy, API-driven frontend) +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** No action — BSE APIs likely still functional for the collector +- **FILE TO EDIT:** N/A + +--- + +## 5. CCIL Rates + +- **ENDPOINT STATUS:** CHANGED — `/web/ccil/fbil-overnight-mibor` returns **404** +- **STRUCTURE:** MODIFIED — CCIL website restructured. New URL paths: + - Money market rates: `/tenor-wise-term-money`, `/repo-summary` + - MIBOR/TREPS: under "Data & Statistics" → "Money Market" → "Treps" + - Yield curve: `/zcyc-parameters` + - FBIL benchmarks: `/Research/FBIL%20Benchmarks/Pages/default.aspx` +- **AUTH REQUIREMENTS:** UNCHANGED +- **NEW FEATURES:** FBIL data also available directly at `www.fbil.org.in` (separate site) +- **RECOMMENDATION:** **Update URL paths in collector.** Current `FBIL_URL = "https://www.fbil.org.in"` is correct for the API endpoint, but the HTML scrape fallback needs updated selectors. Test `https://www.fbil.org.in/api/ratesapi` endpoint from production. +- **FILE TO EDIT:** `collectors/ccil_rates.py` — verify FBIL_URL API endpoint still responds; update HTML scrape fallback paths if API fails + +--- + +## 6. FRED API + +- **ENDPOINT STATUS:** UP (returned 403 with DEMO_KEY, but real key should work) +- **STRUCTURE:** UNCHANGED — standard JSON response format with observations array +- **AUTH REQUIREMENTS:** UNCHANGED (API key required as before) +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** No action. The 403 was due to using `DEMO_KEY`; real API key in env should work fine. Verify in production logs. +- **FILE TO EDIT:** N/A + +--- + +## 7. SEBI + +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** MODIFIED — Not using standard table format anymore. Circulars use list structure with `/legal/rules/[month-year]/[rule-title]_[ID].html` URL pattern. Has "Updated List" vs "Historical Data" toggle. Login modal overlay present. +- **AUTH REQUIREMENTS:** Login modal overlay visible — may indicate new auth for some content +- **NEW FEATURES:** "Historical Data" toggle functionality +- **RECOMMENDATION:** **Investigate** — verify the collector still correctly parses the circulars listing. The URL pattern change from table to list format may need parser update. +- **FILE TO EDIT:** `collectors/sebi_circulars.py` + +--- + +## 8. data.gov.in + +- **ENDPOINT STATUS:** DOWN (403 Forbidden) +- **STRUCTURE:** CANNOT VERIFY — returns 403 to automated requests +- **AUTH REQUIREMENTS:** NEW RESTRICTIONS DETECTED — API blocking non-browser requests +- **NEW FEATURES:** India modernizing core economic data systems (GDP rebasing to 2022-23 base year, new IIP series May 28 2026, e-SIGMA digital platform, GST/eVahan data integration) +- **RECOMMENDATION:** **Investigate** — check if API key auth header is being sent correctly. The site may have added rate limiting or bot detection. Check `DATA_GOV_API_KEY` is valid. +- **FILE TO EDIT:** `collectors/data_gov_in.py` + +--- + +## 9. World Bank API + +- **ENDPOINT STATUS:** UP +- **STRUCTURE:** UNCHANGED — returns `[metadata, data_array]` JSON. Pagination: page/pages/per_page/total. Records have indicator.id, country.id, date, value fields. India 2024 GDP: $3.91T. +- **AUTH REQUIREMENTS:** UNCHANGED (no auth needed) +- **NEW FEATURES:** None detected +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +--- + +## 10. IMF Data + +- **ENDPOINT STATUS:** CHANGED — Legacy `dataservices.imf.org` ECONNREFUSED +- **STRUCTURE:** BROKEN for current collector — IMF has migrated to SDMX Central at `sdmxcentral.imf.org` and new API at `api.imf.org/external/sdmx/3.0` +- **AUTH REQUIREMENTS:** New API may require account sign-in for full access +- **NEW FEATURES:** SDMX 3.0 API available at `api.imf.org/external/sdmx/3.0`. IMF SDMX Central at `sdmxcentral.imf.org/ws/public/sdmxapi/rest/dataflow` confirmed working (BOP, DOT datasets verified present; IFS likely available). +- **RECOMMENDATION:** **Rewrite collector** — migrate from `http://dataservices.imf.org/REST/SDMX_JSON.svc` to new `https://sdmxcentral.imf.org/ws/public/sdmxapi/rest/` endpoint. Update data parsing for SDMX 2.1 XML or new JSON format. +- **FILE TO EDIT:** `collectors/imf_data.py` (lines 14, 24-28: BASE_URL and request logic) + +--- + +## RSS FEEDS + +### Feed Status Summary + +| Feed | Status | Notes | +|------|--------|-------| +| reuters_business | **DEAD** | Discontinued 2020. Remove or replace. | +| reuters_markets | **DEAD** | Discontinued 2020. Remove or replace. | +| et_economy | UNREACHABLE | Blocked from automated fetch (likely bot protection) | +| et_markets | UNREACHABLE | Same as above | +| mint_economy | UNREACHABLE | Same — Mint blocks automated fetches | +| mint_markets | UNREACHABLE | Same | +| moneycontrol | UNREACHABLE | Blocked from this environment | +| rbi_press | UP | Not a real RSS feed (HTML page). Listed URL is the press releases page, not an RSS endpoint. Collector handles this via HTML scraping. | +| fed_press | **UP** | RSS 2.0, items have title/link/description/pubDate/guid/category. Latest: May 22, 2026. Working perfectly. | +| ecb_press | **UP** | RSS 2.0, items have title/link/guid/pubDate. Latest build: May 22, 2026. Working. | +| coindesk | **UP** | RSS 2.0 with media/dc/content namespaces. Latest: May 24-25, 2026. TTL 5min. Working. | +| cnbc | **UP** | RSS 2.0, 27 items, metadata:type tags present. Last built May 24, 2026. Working. | +| ft_markets | UNREACHABLE | FT likely paywalled/blocked | +| arxiv_qfin | **UP** | RSS 2.0, lastBuildDate May 24, 2026. Feed structure intact but items may be empty on weekends. | + +### RSS Recommendations + +- **CRITICAL:** Remove `reuters_business` and `reuters_markets` from `config/sources.yaml` or replace with alternatives: + - Google News RSS: `https://news.google.com/rss/search?q=when:24h+allinurl:reuters.com` + - Or use RSS.app/Feedspot generated feeds +- **INVESTIGATE:** Indian news feeds (ET, Mint, MoneyControl) — verify they work from production server (may be blocked only from non-Indian IPs) +- **FILE TO EDIT:** `config/sources.yaml` (feeds section) + +--- + +## SOCIAL SCRAPERS + +### 11. Reddit + +- **STATUS:** FUNCTIONAL (with caveats) +- **CHANGES:** Reddit API pricing tiers still in effect (since 2023): Free tier = 100 req/min with OAuth, 10K monthly calls. Commercial = $12K/year. The scraper uses `.json` endpoint which is still free for read-only. +- **RISK:** Reddit has increasingly enforced rate limits on unauthorized scraping. The `.json` endpoint may get restricted. +- **RECOMMENDATION:** Monitor rate limit errors in production. Consider if OAuth credentials are being used. The free tier (100 req/min, 10K/month) should suffice for current monitoring volume. +- **FILE TO EDIT:** `scrapers/reddit_scraper.py` (no changes needed now) + +### 12. Hacker News + +- **STATUS:** UP — Firebase API working perfectly +- **STRUCTURE:** UNCHANGED — returns JSON array of integer story IDs. `https://hacker-news.firebaseio.com/v0/topstories.json` confirmed operational. +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 13. YouTube + +- **STATUS:** UP (quota system unchanged) +- **CHANGES:** Default 10,000 units/day quota still in effect. Search costs 100 units each. Stricter compliance audits for high-volume usage. +- **RISK:** 10 search queries × 100 units = 1,000 units/run. At medium frequency, quota should be fine. +- **RECOMMENDATION:** No action — monitor quota usage +- **FILE TO EDIT:** N/A + +### 14. Mastodon + +- **STATUS:** UP — Public timeline API requires no auth +- **STRUCTURE:** UNCHANGED — instances listed (mastodon.social, fosstodon.org, etc.) still support public API +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +### 15. GitHub + +- **STATUS:** UP +- **STRUCTURE:** REST API v3 unchanged. Rate limits: 5000 req/hour with token, 60/hour without. +- **RECOMMENDATION:** No action — ensure `GITHUB_TOKEN` env var is set +- **FILE TO EDIT:** N/A + +### 16. SEC EDGAR + +- **STATUS:** CHANGED — EFTS endpoint returning 403 +- **STRUCTURE:** MODIFIED — EDGAR Next initiative rolled out. New API requires enrollment. EDGAR Release 26.0.1 deployed Feb 2026. +- **AUTH REQUIREMENTS:** NEW — EDGAR Next requires individual user credentials + MFA for submission API. Read access may now require proper User-Agent with contact email. +- **NEW FEATURES:** EDGAR Next APIs (submission, account management, role-based access) +- **RECOMMENDATION:** **Update User-Agent header** in `sec_scraper.py`. Current: `"SocialScraper research@example.com"` — update the email to a real contact email. SEC requires: `Company Name AdminEmail@company.com`. Also verify EFTS URL hasn't changed to a new path under EDGAR Next. +- **FILE TO EDIT:** `scrapers/sec_scraper.py` (line 62: User-Agent header, lines 52-54: EFTS_URL) + +### 17. Discord + +- **STATUS:** UP — API v10 still current +- **STRUCTURE:** UNCHANGED +- **RECOMMENDATION:** No action — verify `DISCORD_BOT_TOKEN` has MESSAGE_CONTENT intent +- **FILE TO EDIT:** N/A + +### 18. Dark Web + +- **STATUS:** Cannot verify (Tor proxy not available from this environment) +- **RECOMMENDATION:** Verify tor container is running and SOCKS5 proxy at port 9050 is accessible from worker container +- **FILE TO EDIT:** N/A + +### 19. Web Scraper + +- **STATUS:** Site-dependent +- **RECOMMENDATION:** No action — general purpose scraper adapts to targets +- **FILE TO EDIT:** N/A + +### 20. Central Banks + +- **STATUS:** MIXED + - Fed press RSS: UP (confirmed working, latest May 22, 2026) + - ECB press RSS: UP (confirmed working, latest May 22, 2026) + - RBI press: UP (HTML page, not RSS — collector handles via scraping) +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +--- + +## MESSAGING + +### 21. Telegram + +- **STATUS:** UP (API access depends on credentials) +- **CHANGES:** No known API changes. Telegram Bot API stable. +- **RECOMMENDATION:** Verify `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` are current. Check channel access. +- **FILE TO EDIT:** N/A + +### 22. Twitter/X + +- **STATUS:** HIGH RISK +- **CHANGES (CRITICAL):** + - X introduced "Pay-Per-Use" billing Feb 2026 + - Free tier is WRITE-ONLY (cannot read/scrape) + - Basic tier: $200/mo for 10K tweets read + - Cloudflare Turnstile deployed on login walls and rate-limited endpoints + - Jan 2025: Guest token bound to browser fingerprints; datacenter IPs banned + - Legal threat: $15K liquidated damages for >1M posts/24h automated access +- **AUTH REQUIREMENTS:** Cookie-based scraping via `twikit` is HIGH RISK — X actively detects and bans +- **RECOMMENDATION:** **HIGH PRIORITY** — Current `twikit` cookie approach is likely broken or at serious risk. Options: + 1. Pay for Basic tier ($200/mo) for legitimate read access + 2. Use a third-party scraping API service (various providers available) + 3. Reduce scraping frequency significantly + 4. Accept that Twitter data may be intermittent +- **FILE TO EDIT:** `scrapers/twitter_scraper.py`, `collectors/twitter_lists.py` + +--- + +## CONNECTORS + +### 23. DragonScope Connector +- **STATUS:** UP (depends on Redis and API endpoint) +- **RECOMMENDATION:** Verify `DRAGONSCOPE_REDIS_URL` and `DRAGONSCOPE_API_URL` connectivity +- **FILE TO EDIT:** N/A + +### 24. LiquiFi Connector +- **STATUS:** UP (depends on Redis and API endpoint) +- **RECOMMENDATION:** Verify `LIQUIFI_REDIS_URL` and `LIQUIFI_API_URL` connectivity +- **FILE TO EDIT:** N/A + +### 25. Router +- **STATUS:** UP — classification logic solid +- **STRUCTURE:** Platform-based routing with content-override. Coverage: + - DragonScope: Reddit, Discord, YouTube, HackerNews, Mastodon, GitHub + - LiquiFi: Central Bank data + - Both: Twitter, Telegram, RSS, Web, SEC EDGAR, Dark Web +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** N/A + +--- + +## INFRASTRUCTURE + +### 26. Celery Beat Schedules + +- **STATUS:** CONSISTENT +- `scheduler/schedule.py` — correctly marked as deprecated (reference only) +- `core/scheduler.py` — builds beat schedule dynamically from `config/sources.yaml` +- All sources in `config/sources.yaml` have matching collector files in `collectors/` +- All scrapers have corresponding files in `scrapers/` +- **No orphaned schedule entries detected** +- **RECOMMENDATION:** No action + +### 27. Docker Compose + +- **STATUS:** OK +- Services defined: postgres, redis, minio, zookeeper, kafka, tor, api, worker (+ beat implied) +- Ports: 5432, 6379, 9000/9001, 2181, 9092, 9050/8118, 8000 — no conflicts +- Health checks: postgres (pg_isready), redis (ping), minio (mc ready), kafka (topics --list), api (curl /health) +- **Note:** `dperson/torproxy` image — verify it's still maintained (last DockerHub update should be checked) +- **RECOMMENDATION:** No action needed for port conflicts or health checks + +--- + +## NEW DATA SOURCES DISCOVERED + +| Source | URL | Value for NBFC Treasury | +|--------|-----|------------------------| +| FBIL Direct | `https://www.fbil.org.in` | Official benchmark rates (MIBOR, MIFOR, MIOIS) — more authoritative than CCIL for rate data | +| RBI New Portal | `https://data.rbi.org.in/DBIE/` | Modernized DBIE with potential API access | +| API Setu | `https://www.apisetu.gov.in` | Government API marketplace — may have new economic datasets | +| FinEdge API | `https://www.finedgeapi.com` | Indian corporate financial data (P&L, Balance Sheet, Cash Flow) | +| TrueData | `https://www.truedata.in` | Real-time NSE/BSE/MCX market data API | +| India GDP Rebasing | New IIP series releasing May 28, 2026 | Updated economic indicators with 2022-23 base year | +| IMF SDMX 3.0 | `https://api.imf.org/external/sdmx/3.0` | New API with better data access | +| EDGAR Next APIs | `https://www.sec.gov/submit-filings/filer-support-resources` | Modernized EDGAR filing access | + +--- + +## PRIORITY ACTION ITEMS + +### P0 — Immediate (This Week) + +1. **Update RBI DBIE URL** → `data.rbi.org.in/DBIE/` + - File: `collectors/rbi_dbie.py` line 24, `config/sources.yaml` + +2. **Fix/Verify CCIL rates** — test FBIL API endpoint from production + - File: `collectors/ccil_rates.py` + +3. **Remove dead Reuters feeds** from `config/sources.yaml` + - Replace with Google News RSS alternatives or remove entirely + +### P1 — High Priority (Next Sprint) + +4. **Rewrite IMF collector** for SDMX Central API + - File: `collectors/imf_data.py` — migrate to `sdmxcentral.imf.org` + +5. **Update SEC EDGAR User-Agent** with real contact email + - File: `scrapers/sec_scraper.py` line 62 + +6. **Assess Twitter/X scraping viability** — decide on paid API vs third-party service vs accept degraded data + - Files: `scrapers/twitter_scraper.py`, `collectors/twitter_lists.py` + +### P2 — Medium Priority (Next 2 Weeks) + +7. **Verify SEBI collector** still parses new list-based layout + - File: `collectors/sebi_circulars.py` + +8. **Check data.gov.in API** from production (may be geo-restricted) + - File: `collectors/data_gov_in.py` + +9. **Evaluate new sources:** FBIL direct, API Setu, FinEdge API for NBFC treasury data enrichment + +--- + +## ENVIRONMENT NOTES + +- Some endpoints (ET, Mint, MoneyControl, NSE, FT) are unreachable from this validation environment but may work from the production server (different IP/geo/headers). Check production collector logs for ground truth. +- SEC EDGAR's 403 responses may be specific to this environment's IP/User-Agent. Production should be verified separately. +- IMF legacy API connection refused is definitive — the service has been decommissioned. diff --git a/reports/weekly_source_validation_2026-04-13.md b/reports/weekly_source_validation_2026-04-13.md new file mode 100644 index 0000000..a7ad7e9 --- /dev/null +++ b/reports/weekly_source_validation_2026-04-13.md @@ -0,0 +1,396 @@ +# Weekly Deep Source Validation Report +**Date:** 2026-04-13 (Sunday) +**System:** social_scraper (econscraper) + +--- + +## EXECUTIVE SUMMARY + +**25 sources checked | 7 BROKEN | 6 DEGRADED | 12 HEALTHY** + +### URGENT: Regulatory & Compliance Changes +- **NSE Static IP Mandate (April 1, 2026)** - All algo trading API keys without static IP binding have expired. Any automated data collection connected to NSE needs static IP compliance verification immediately. +- **RBI Data Protection Advisory (April 2026)** - New advisory directing all regulated entities (including NBFCs) to prioritize customer data protection and API security. Mandatory compliance. +- **RBI Digital Payment Authentication Framework (April 1, 2026)** - Risk-based authentication replacing SMS OTP-only approaches. Affects payment processing. +- **X/Twitter API now pay-per-use** - No free tier for new developers. $0.005/post read, $0.010/user profile. Budget impact for sentiment scraping. + +### Critical Issues Requiring Immediate Action +1. **Reuters RSS feeds** - DEAD — `feeds.reuters.com` DNS no longer resolves. Replace URLs. +2. **FBIL/CCIL rates** - `fbil.org.in` returning ECONNREFUSED. Migrate to `ccilindia.com`. Critical for LiquiFi. +3. **Mastodon public timeline** - Now returns 422 without auth. Scraper needs token. +4. **RBI DBIE** - TLS cert broken on `dbie.rbi.org.in`. Portal migrated to `data.rbi.org.in` (SPA). +5. **data.gov.in** - All endpoints returning 403/404. API may be retired or migrated. +6. **IMF SDMX API** - `dataservices.imf.org` completely unreachable (TCP timeout). Portal rebuilt as Next.js app. API likely decommissioned. +7. **Moneycontrol RSS** - Feed FROZEN since April 2024. Serving stale cached data. Find new URL. + +### Warnings (Degraded but Functional) +7. **SEC EDGAR EFTS** - 403 without proper User-Agent. Scraper has placeholder email. +8. **World Bank API** - Intermittent timeouts (slow/overloaded). +9. **ET Economy + ECB Press RSS** - Missing `` tags in items. Parser needs fallback. +10. **RBI Press RSS** - Configured URL is HTML page, not RSS feed. Needs HTML scraper or new URL. +11. **arXiv q-fin RSS** - Empty on weekends by design (academic publishing schedule, not a bug). + +--- + +## STRUCTURED DATA COLLECTORS + +### 1. RBI DBIE (collectors/rbi_dbie.py) +- **ENDPOINT STATUS:** **DEGRADED** - `dbie.rbi.org.in` has **TLS certificate mismatch** (ERR_TLS_CERT_ALTNAME_INVALID). Portal appears to have migrated to `data.rbi.org.in`. +- **STRUCTURE:** **CHANGED** - New URL `data.rbi.org.in` is a heavy SPA (MapMyIndia API, Material Design) — no server-side data, requires headless browser (Playwright/Puppeteer) or discovery of underlying API endpoints. +- **AUTH REQUIREMENTS:** UNCHANGED - No login wall, but SPA architecture blocks simple HTTP scraping. +- **RECOMMENDATION:** Update BASE_URL to `data.rbi.org.in`. Fallback scraping of `PublicationsView.aspx` on `rbi.org.in` still works and is the most reliable path. Investigate `data.rbi.org.in` XHR endpoints for direct API access. +- **FILE TO EDIT:** collectors/rbi_dbie.py (update BASE_URL; optionally add headless browser support) + +### 2. RBI Circulars (collectors/rbi_circulars.py) +- **ENDPOINT STATUS:** UP - rbi.org.in accessible +- **STRUCTURE:** UNCHANGED - Press releases, notifications, circulars pages accessible +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 3. NSE Bhavcopy (collectors/nse_bhavcopy.py) +- **ENDPOINT STATUS:** UP (with anti-bot measures) +- **STRUCTURE:** UNCHANGED - API endpoints respond after cookie prefetch +- **AUTH REQUIREMENTS:** UNCHANGED - Session cookies from homepage required +- **NEW FEATURES:** None detected +- **⚠️ NSE STATIC IP MANDATE (April 1, 2026):** All algo trading API keys without static IP binding have expired. IPv4 only (no IPv6), max 2 IPs (primary+secondary), must be registered with broker. Verify data collection server IPs are compliant. +- **RECOMMENDATION:** **VERIFY** static IP compliance for production server. Cookie prefetch pattern still works for data scraping, but any order/algo API integration is affected. +- **FILE TO EDIT:** collectors/nse_bhavcopy.py (verify; may need config for static IP) + +### 4. BSE API (collectors/bse_api.py) +- **ENDPOINT STATUS:** SLOW - api.bseindia.com times out intermittently +- **STRUCTURE:** UNCHANGED - API endpoint patterns unchanged +- **AUTH REQUIREMENTS:** UNCHANGED - Referer header still required +- **RECOMMENDATION:** Consider adding retry with backoff for timeout resilience +- **FILE TO EDIT:** collectors/bse_api.py (optional improvement) + +### 5. CCIL/FBIL Rates (collectors/ccil_rates.py) +- **ENDPOINT STATUS:** **SPLIT** - `ccilindia.com` is UP (ZCYC rates, MIBOR-OIS, bond/forex/derivatives data under Data & Statistics). `fbil.org.in` is **DOWN** (ECONNREFUSED confirmed). FBIL sub-pages on CCIL also timed out. +- **STRUCTURE:** CCIL has yield curve data, MIBOR data, government securities, forex data accessible from its Data & Statistics section. FBIL reference rates (MIBOR benchmarks, TREPS) are NOT directly on CCIL homepage — FBIL is a separate entity whose domain is down. +- **AUTH REQUIREMENTS:** CCIL public content accessible without auth. Sign-in exists for member portal. +- **RECOMMENDATION:** **MIGRATE** collector to source MIBOR/ZCYC data from `ccilindia.com/web/ccil/` Data & Statistics pages instead of `fbil.org.in`. Critical for LiquiFi connector (MIBOR, TREPS, yield curve, CP/CD rates). +- **FILE TO EDIT:** collectors/ccil_rates.py (replace FBIL_URL with CCIL data pages; add CCIL ZCYC endpoint) + +### 6. FRED API (collectors/fred_api.py) +- **ENDPOINT STATUS:** UP - API responds correctly with valid API key +- **STRUCTURE:** UNCHANGED - JSON response format stable +- **AUTH REQUIREMENTS:** UNCHANGED - API key required (DEMO_KEY returns 403 as expected) +- **NEW FEATURES:** **FRED API v2 (Nov 2025)** - Bulk retrieval of observations for all series in any release with full history in JSON/XML. ALFRED archival content no longer saveable (Jan 5, 2026). +- **RECOMMENDATION:** **UPGRADE** to FRED API v2 bulk retrieval for efficient macro data pulls (FEDFUNDS, DGS10, SOFR, etc.). Significant efficiency gain for treasury analysis. +- **FILE TO EDIT:** collectors/fred_api.py (add v2 bulk retrieval endpoint support) + +### 7. SEBI Circulars (collectors/sebi_circulars.py) +- **ENDPOINT STATUS:** UP (partially) - Homepage and Enforcement section load. **Circulars-specific listing page times out repeatedly.** +- **STRUCTURE:** UNCHANGED - URL pattern `HomeAction.do?doListing=yes&sid=[section]&ssid=[subsection]`. Confirmed: sid=1 (Legal Framework), sid=2 (Enforcement, 54K+ records, 25/page pagination). Legal document pattern: `/legal/[type]/[month-year]/[title]_[id].html`. +- **AUTH REQUIREMENTS:** UNCHANGED - Public documents accessible. SI Portal requires login. +- **RECOMMENDATION:** Increase HTTP timeout to 90s+. Add retry logic specifically for circulars listing. Consider scraping from the Legal Framework section (sid=1) as an alternative path. +- **FILE TO EDIT:** collectors/sebi_circulars.py (timeout 90s, retry logic) + +### 8. data.gov.in (collectors/data_gov_in.py) +- **ENDPOINT STATUS:** **DOWN/BLOCKED** - All endpoints returning 403 Forbidden or 404. Main domain, catalog page, search page, and API subdomain (`api.data.gov.in`) all reject non-browser requests. +- **STRUCTURE:** UNKNOWN - Cannot verify; API subdomain returns 404 suggesting possible API restructuring or retirement. +- **AUTH REQUIREMENTS:** Cannot be determined. Historically required API key registration. +- **RECOMMENDATION:** **INVESTIGATE URGENTLY** - Test from server with browser-like headers. If still blocked, check if API has been restructured under new endpoints (e.g., `apisetu.gov.in`). May need to deprioritize or mark as broken. +- **FILE TO EDIT:** collectors/data_gov_in.py (headers + possible URL migration) + +### 9. World Bank API (collectors/world_bank.py) +- **ENDPOINT STATUS:** DEGRADED - Intermittent timeouts (60s+) +- **STRUCTURE:** UNCHANGED when accessible - JSON format with paginated arrays +- **AUTH REQUIREMENTS:** UNCHANGED - No auth needed +- **RECOMMENDATION:** Increase timeout to 90s; add retry with exponential backoff +- **FILE TO EDIT:** collectors/world_bank.py (timeout/retry improvement) + +### 10. IMF Data (collectors/imf_data.py) +- **ENDPOINT STATUS:** **DOWN** - `dataservices.imf.org` SDMX REST API is unreachable. DNS resolves (134.113.242.23) but TCP connection times out on port 443. Tested with 10s, 15s, 60s, 90s timeouts — all fail. Dataflow endpoint also dead. +- **STRUCTURE:** CANNOT VERIFY - No response received from any endpoint. +- **AUTH REQUIREMENTS:** Historically none. Cannot verify current state. +- **PORTAL:** `data.imf.org` is accessible (HTTP 200) but has been rebuilt as a **Next.js application** — suggesting platform migration. Old SDMX API may be decommissioned. +- **NOTE:** Collector uses HTTP (not HTTPS) for a now-dead endpoint — both issues need fixing. +- **RECOMMENDATION:** **INVESTIGATE NEW IMF API** - The old `dataservices.imf.org/REST/SDMX_JSON.svc/` path appears decommissioned. Check IMF developer docs for replacement API (likely under `data.imf.org` or new SDMX endpoint). This is a **breaking change**. +- **FILE TO EDIT:** collectors/imf_data.py (new BASE_URL needed — old endpoint is dead) + +--- + +## RSS FEEDS (collectors/rss_feeds.py) + +| # | Feed | Status | Items | Structure | Issue | +|---|------|--------|-------|-----------|-------| +| 1 | reuters_business | **BROKEN** | 0 | DNS dead | `feeds.reuters.com` no longer resolves. Dead since 2020. | +| 2 | reuters_markets | **BROKEN** | 0 | DNS dead | Same — entire domain is defunct | +| 3 | et_economy | UP | 49 | title, link, guid, pubDate | **Missing ``** in items (only channel-level) | +| 4 | et_markets | UP | 50 | title, description, link, enclosure, guid, pubDate | Full structure with images | +| 5 | mint_economy | UP | 35 | title, description, link, guid, pubDate, media:content | Healthy (CDATA wrapping) | +| 6 | mint_markets | UP | 35 | title, description, link, guid, pubDate, media:content | Healthy | +| 7 | moneycontrol | **STALE** | 15 | title, description, link, guid, pubDate | **FROZEN since April 2024.** lastBuildDate=Aug 2024. ISO-8859-1 encoding. | +| 8 | rbi_press | **WARNING** | N/A | **HTML, not RSS** | ASPX page, not a feed. `/Scripts/rss.aspx` also HTML. Needs HTML scraper. | +| 9 | fed_press | UP | 20 | title, link, guid, description, category, pubDate | Excellent — includes `` tags | +| 10 | ecb_press | UP | 15 | title, link, guid, pubDate | **Missing ``** in items. Some links to PDFs. Double-slash URLs. | +| 11 | coindesk | UP | 25 | title, description, link, guid, pubDate, media:content, dc:creator, content:encoded | Healthy — rich feed, TTL=5min. (403 only from some tools) | +| 12 | cnbc | UP | 25 | title, description, link, guid, pubDate + custom metadata namespace | Healthy — TTL=60min | +| 13 | ft_markets | UP | 25 | title, description, link, guid, pubDate | Healthy — redirects to stream URL. TTL=15min. | +| 14 | arxiv_qfin | UP | 15 | title, description (abstract), link, guid, category, dc:creator, arxiv:announce_type | Healthy — **empty on weekends by design** (academic publishing schedule) | + +### RSS Feed Actions Required: +- **REPLACE** reuters_business and reuters_markets URLs — domain `feeds.reuters.com` is completely dead (DNS failure) +- **FIX** moneycontrol feed — frozen since April 2024, serving stale cached data. Find new RSS URL or switch to web scraping. +- **FIX** rbi_press URL — this is an HTML page, not RSS. Switch to HTML scraper or find actual RSS endpoint. +- **HANDLE** et_economy and ecb_press missing `` gracefully in parser (use title as fallback) +- **NOTE** coindesk is actually healthy (25 items) — earlier 403 was tool-specific, not a real block +- **NOTE** arxiv_qfin is empty on weekends by design, not intermittent + +**FILE TO EDIT:** config/sources.yaml (Reuters URLs, moneycontrol URL), collectors/rss_feeds.py (description fallback for et_economy/ecb_press, rbi_press HTML handling) + +--- + +## SOCIAL SCRAPERS + +### 11. Reddit (scrapers/reddit_scraper.py) +- **ENDPOINT STATUS:** UP - .json endpoint works from servers (blocked by some CDN tools) +- **STRUCTURE:** UNCHANGED - data.children array with post objects +- **AUTH REQUIREMENTS:** UNCHANGED for .json endpoint. **Official API pricing:** Standard tier $12K/year (100 req/min), Enterprise $50K-$500K+. r/all is being deprecated (Reddit shifting to algorithmic discovery). +- **RECOMMENDATION:** Continue using .json endpoint (free, no API key). Monitor for any restrictions on unauthenticated access. Note r/all deprecation may affect subreddit discovery. +- **FILE TO EDIT:** None (monitor only) + +### 12. Hacker News (scrapers/hackernews_scraper.py) +- **ENDPOINT STATUS:** UP - Firebase API fully functional +- **STRUCTURE:** UNCHANGED - topstories.json returns 500 IDs, newstories works +- **AUTH REQUIREMENTS:** UNCHANGED - No auth needed +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 13. YouTube (scrapers/youtube_scraper.py) +- **ENDPOINT STATUS:** UP - Data API v3 responds (auth error without key = expected) +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED - API key required, quota applies +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 14. Mastodon (scrapers/mastodon_scraper.py) +- **ENDPOINT STATUS:** **BROKEN** - `/api/v1/timelines/public` returns 422 "Unprocessable Entity" +- **STRUCTURE:** CHANGED - mastodon.social now requires authentication for public timeline +- **AUTH REQUIREMENTS:** **NEW RESTRICTION** - Authentication token now required for `/timelines/public` +- **WORKAROUND AVAILABLE:** `/api/v1/trends/statuses` works WITHOUT auth (HTTP 200). Returns trending statuses with full structure (id, content, account, stats, quotes). Instance is v4.6.0-nightly with 3.2M users. +- **RECOMMENDATION:** **QUICK FIX:** Switch to `/api/v1/trends/statuses` endpoint (no auth needed, returns trending content). **FULL FIX:** Add OAuth token support for `/timelines/public` access. Can use both: trends for immediate data, public timeline for comprehensive coverage with auth. +- **FILE TO EDIT:** scrapers/mastodon_scraper.py (switch default endpoint to trends/statuses; add optional OAuth token for public timeline) + +### 15. GitHub (scrapers/github_scraper.py) +- **ENDPOINT STATUS:** UP - API working, 60 req/hr unauthenticated +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED - Token optional but recommended for higher limits +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 16. SEC EDGAR (scrapers/sec_scraper.py) +- **ENDPOINT STATUS:** **DEGRADED** - EFTS returns 403 without proper User-Agent +- **STRUCTURE:** UNCHANGED - But new `data.sec.gov` REST API available alongside EFTS +- **AUTH REQUIREMENTS:** **STRICTER** - SEC now enforces User-Agent header with real email +- **NEW FEATURES:** EDGAR Release 26.1 (March 2026) added operational status indicators for degraded service. EDGAR Release 26.0.1 (Feb 2026) updated submission notifications. `data.sec.gov` REST APIs remain free, keyless, JSON — only need User-Agent. +- **RECOMMENDATION:** **UPDATE User-Agent** - Replace `research@example.com` with a real email. Consider adding `data.sec.gov` as supplementary source. +- **FILE TO EDIT:** scrapers/sec_scraper.py (line 61: update User-Agent email) + +### 17. Discord (scrapers/discord_scraper.py) +- **ENDPOINT STATUS:** UP - API v10 is current +- **STRUCTURE:** MINOR CHANGES - PIN_MESSAGES split from MANAGE_MESSAGES (Feb 23, 2026). DAVE E2E encryption mandatory for voice/video (March 1, 2026). +- **AUTH REQUIREMENTS:** UNCHANGED - Bot token + MESSAGE_CONTENT intent required. Check if bot permissions need updating for PIN_MESSAGES split. +- **RECOMMENDATION:** Verify bot permissions after PIN_MESSAGES/MANAGE_MESSAGES split. Voice/video not relevant for text scraping. +- **FILE TO EDIT:** scrapers/discord_scraper.py (check permission flags if using pinning) + +### 18. Dark Web (scrapers/darkweb_scraper.py) +- **ENDPOINT STATUS:** DEPENDS ON TOR - Cannot verify externally (needs SOCKS5 proxy) +- **STRUCTURE:** N/A - Varies by .onion site +- **AUTH REQUIREMENTS:** UNCHANGED - Tor proxy required +- **RECOMMENDATION:** Test from Docker container with Tor proxy running +- **FILE TO EDIT:** None + +### 19. Web Scraper (scrapers/web_scraper.py) +- **ENDPOINT STATUS:** UP - General web targets accessible +- **STRUCTURE:** UNCHANGED +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 20. Central Banks (scrapers/centralbank_scraper.py) +- **ENDPOINT STATUS:** UP - Fed RSS confirmed working, RBI pages accessible +- **STRUCTURE:** UNCHANGED - Standard RSS/HTML scraping +- **AUTH REQUIREMENTS:** UNCHANGED +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +--- + +## MESSAGING + +### 21. Telegram (collectors/telegram_channels.py) +- **ENDPOINT STATUS:** REQUIRES CREDENTIALS - Cannot verify without api_id/api_hash +- **STRUCTURE:** N/A +- **AUTH REQUIREMENTS:** UNCHANGED - Telethon/Pyrogram with API credentials +- **RECOMMENDATION:** Verify from running instance +- **FILE TO EDIT:** None + +### 22. Twitter/X (collectors/twitter_lists.py) +- **ENDPOINT STATUS:** UNCERTAIN - Cookie-based scraping fragile; X continues tightening +- **STRUCTURE:** LIKELY CHANGED - X regularly changes page structure +- **AUTH REQUIREMENTS:** **MAJOR CHANGE** - X shifted to pay-per-use model (Feb 6, 2026). No free tier for new developers. Reading a post: $0.005, user profile: $0.010, creating post: $0.010. 24h deduplication window. 2M post-read/month cap before Enterprise required. Legacy free tier users get $10 voucher then move to pay-as-you-go. +- **COST ESTIMATE:** At 10K posts/day = ~$1,500/month. Consider cost-benefit for sentiment scraping. +- **RECOMMENDATION:** **EVALUATE BUDGET** - Cookie scraping remains the free path but is fragile. If moving to official API, budget for pay-per-use costs. Consider reducing scrape frequency or filtering to treasury-relevant content only. +- **FILE TO EDIT:** collectors/twitter_lists.py (verify cookie auth; if migrating to API, add billing config) + +--- + +## CONNECTORS + +### 23. DragonScope Connector (connectors/dragonscope.py) +- **CODE STATUS:** HEALTHY +- **STRUCTURE:** Well-architected with Redis primary + API fallback +- **CATEGORIES:** reddit_posts, news, github_repos, sec_filings all mapped +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 24. LiquiFi Connector (connectors/liquifi.py) +- **CODE STATUS:** HEALTHY +- **STRUCTURE:** Good treasury keyword scoring with boundary-matching for short keywords +- **RATE PATTERNS:** Properly validates ranges (repo 0-15%, MIBOR 0-20%, USDINR 40-150) +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +### 25. Router (connectors/router.py) +- **CODE STATUS:** HEALTHY +- **LOGIC:** Platform-based routing with treasury-score content override +- **CATEGORIES COVERED:** All 13 platforms mapped to DRAGONSCOPE/LIQUIFI/BOTH +- **RECOMMENDATION:** No action +- **FILE TO EDIT:** None + +--- + +## INFRASTRUCTURE + +### 26. Celery Beat Schedules +- **scheduler/schedule.py** - DEPRECATED (correctly marked, imports are no-op) +- **core/scheduler.py** - CANONICAL source, dynamically builds from sources.yaml +- **config/sources.yaml** - 15 YAML-driven collectors defined +- **Social scraper schedules** - 13 additional hard-coded in core/scheduler.py + +**Verification Results:** +| Schedule Entry | Task Function | Scraper/Collector File | Status | +|---------------|---------------|----------------------|--------| +| collect-rbi_dbie | core.tasks.run_collector | collectors/rbi_dbie.py | OK | +| collect-rbi_circulars | core.tasks.run_collector | collectors/rbi_circulars.py | OK | +| collect-fred_api | core.tasks.run_collector | collectors/fred_api.py | OK | +| collect-nse_bhavcopy | core.tasks.run_collector | collectors/nse_bhavcopy.py | OK | +| collect-bse_api | core.tasks.run_collector | collectors/bse_api.py | OK | +| collect-ccil_rates | core.tasks.run_collector | collectors/ccil_rates.py | OK | +| collect-data_gov_in | core.tasks.run_collector | collectors/data_gov_in.py | OK | +| collect-sebi_circulars | core.tasks.run_collector | collectors/sebi_circulars.py | OK | +| collect-world_bank | core.tasks.run_collector | collectors/world_bank.py | OK | +| collect-imf_data | core.tasks.run_collector | collectors/imf_data.py | OK | +| collect-rss_feeds | core.tasks.run_collector | collectors/rss_feeds.py | OK | +| collect-telegram_channels | core.tasks.run_collector | collectors/telegram_channels.py | OK | +| collect-twitter_lists | core.tasks.run_collector | collectors/twitter_lists.py | OK | +| scrape-reddit | core.tasks.scrape_reddit | scrapers/reddit_scraper.py | OK | +| scrape-twitter | core.tasks.scrape_twitter | scrapers/twitter_scraper.py | OK | +| scrape-hackernews | core.tasks.scrape_hackernews | scrapers/hackernews_scraper.py | OK | +| scrape-youtube | core.tasks.scrape_youtube | scrapers/youtube_scraper.py | OK | +| scrape-rss-financial | core.tasks.scrape_rss_financial | scrapers/rss_scraper.py | OK | +| scrape-central-banks | core.tasks.scrape_central_banks | scrapers/centralbank_scraper.py | OK | +| scrape-sec | core.tasks.scrape_sec | scrapers/sec_scraper.py | OK | +| scrape-github | core.tasks.scrape_github | scrapers/github_scraper.py | OK | +| scrape-mastodon | core.tasks.scrape_mastodon | scrapers/mastodon_scraper.py | OK | +| scrape-darkweb | core.tasks.scrape_darkweb | scrapers/darkweb_scraper.py | OK | +| scrape-web | core.tasks.scrape_web | scrapers/web_scraper.py | OK | +| scrape-discord | core.tasks.scrape_discord | scrapers/discord_scraper.py | OK | + +**No orphaned schedule entries.** All tasks have corresponding files and functions. +**No missing schedules.** All scrapers/collectors have schedule entries. + +### 27. Docker Compose +- **Services:** 10 defined (postgres, redis, minio, zookeeper, kafka, tor, api, worker, beat, nlp-worker, flower) +- **Port conflicts:** NONE - All ports unique (5432, 6379, 9000, 9001, 2181, 9092, 9050, 8118, 8000, 5555) +- **Health checks:** postgres, redis, minio, kafka, api all have health checks +- **Missing health checks:** tor (restart: unless-stopped only), beat, nlp-worker, flower +- **RECOMMENDATION:** Add health checks for beat and nlp-worker services +- **FILE TO EDIT:** docker-compose.yml (optional - add health checks) + +--- + +## NEW DATA SOURCES & API CHANGES + +### Regulatory & Compliance Changes (NBFC Treasury) +| # | Change | Effective | Impact | Action | +|---|--------|-----------|--------|--------| +| 1 | **NSE Static IP Mandate** | April 1, 2026 | All algo API keys without static IP binding expired. IPv4 only, 2 IPs max. | Verify server IPs registered with broker | +| 2 | **RBI Data Protection Advisory** | April 2026 | All regulated entities must prioritize API security and data handling compliance | Review scraper data handling practices | +| 3 | **RBI Digital Payment Auth Framework** | April 1, 2026 | Risk-based authentication replacing SMS OTP-only | Update any payment auth integrations | +| 4 | **RBI Governance Overhaul** | January 2026 | Data security/privacy policies need formal governance approval | Ensure policy documentation is current | + +### API Pricing & Access Changes +| # | Platform | Change | Cost Impact | +|---|----------|--------|-------------| +| 5 | **X/Twitter** | Pay-per-use model (Feb 6, 2026). No free tier. $0.005/post read, $0.010/profile. 2M reads/month cap. | ~$1,500/month at 10K posts/day | +| 6 | **Reddit** | Standard tier $12K/year. r/all being deprecated. | Free .json endpoint still works | +| 7 | **YouTube** | No changes. 10K units/day quota unchanged. | No impact | +| 8 | **Discord** | PIN_MESSAGES split from MANAGE_MESSAGES (Feb 23, 2026). DAVE E2E encryption for voice (March 1, 2026). | Bot permission audit needed | + +### API Updates & New Features +| # | Source | Update | Benefit | +|---|--------|--------|---------| +| 9 | **FRED API v2** | Bulk retrieval of all series in a release (Nov 2025). JSON/XML. | Major efficiency gain for macro data | +| 10 | **SEC EDGAR 26.1** | New operational status indicators (March 2026). data.sec.gov REST API free/keyless. | Better monitoring, new data source | +| 11 | **Reuters RSS** | Fully dead. Workarounds failed March 2026. | Replace with alternatives | +| 12 | **Mastodon** | mastodon.social now requires auth for public timeline. | Add auth token | +| 13 | **CoinDesk** | Cloudflare bot protection blocks RSS. | Switch feed URL or add headers | +| 14 | **RBI DPIP** | Digital Payments Intelligence Platform — AI-powered transaction analysis. | Monitor for data access opportunities | + +### Potential New Data Sources for NBFC Treasury +| # | Source | Type | Relevance | +|---|--------|------|-----------| +| 1 | **Breeze API** (ICICIdirect) | Free Indian market data, historical OHLC, streaming | HIGH — direct NSE/BSE data, no cost | +| 2 | **GitHub NSE/BSE API** (0xramm) | Free REST API via Yahoo Finance backend, no key needed | HIGH — backup for NSE/BSE data | +| 3 | **NaBFID** | Infrastructure financing DFI data | MEDIUM — relevant to bond/credit markets | +| 4 | **RBI KLEMS** | Productivity/growth database | MEDIUM — macro indicators | +| 5 | **NPCI** | UPI transaction statistics | MEDIUM — digital payment flow signals | +| 6 | **IBBI** | Insolvency/bankruptcy data | MEDIUM — corporate health signals | +| 7 | **data.sec.gov** | Free SEC RESTful JSON API | MEDIUM — supplements EFTS search | +| 8 | **Upstox API** | Free market data (may have expired March 31, 2026) | LOW — verify availability | + +--- + +## PRIORITY ACTION ITEMS + +### P0 - URGENT: Compliance & Broken Sources (This Week) +| # | Issue | File | Action | +|---|-------|------|--------| +| 1 | **NSE Static IP Mandate** (April 1 deadline PASSED) | Infrastructure | Verify production server IPs registered with broker; IPv4 only | +| 2 | **RBI Data Protection Advisory** (April 2026) | All collectors | Review API security and data handling for NBFC compliance | +| 3 | FBIL down (ECONNREFUSED) | collectors/ccil_rates.py | Migrate to `ccilindia.com` Data & Statistics for MIBOR/ZCYC data | +| 4 | **RBI DBIE TLS cert broken** | collectors/rbi_dbie.py | Update BASE_URL to `data.rbi.org.in`; investigate SPA API endpoints | +| 5 | **data.gov.in fully blocked** (403/404 all endpoints) | collectors/data_gov_in.py | Test with browser headers; check if API migrated to `apisetu.gov.in` | +| 6 | Reuters RSS dead | config/sources.yaml | Replace with RSS.app proxies or switch to web scraping | +| 7 | Mastodon needs auth | scrapers/mastodon_scraper.py | **Quick fix:** switch to `/api/v1/trends/statuses` (no auth). Full fix: add OAuth token. | +| 8 | SEC User-Agent placeholder | scrapers/sec_scraper.py:61 | Replace `research@example.com` with real email | +| 9 | **IMF SDMX API dead** (TCP timeout) | collectors/imf_data.py | Find replacement API — old `dataservices.imf.org` decommissioned. Check `data.imf.org` docs. | + +### P1 - Fix This Month +| # | Issue | File | Action | +|---|-------|------|--------| +| 10 | **X/Twitter cost planning** | collectors/twitter_lists.py | Evaluate pay-per-use API vs cookie scraping; budget if migrating | +| 11 | **FRED API v2 upgrade** | collectors/fred_api.py | Add v2 bulk retrieval for efficient macro data pulls | +| 12 | **Moneycontrol RSS frozen** (since April 2024) | config/sources.yaml | Find new RSS URL or switch to web scraping | +| 13 | **RBI Press not RSS** (HTML page) | config/sources.yaml, collectors/rss_feeds.py | Switch to HTML scraper or find actual RSS endpoint | +| 14 | ET Economy + ECB no descriptions | collectors/rss_feeds.py | Handle missing `` gracefully (use title fallback) | +| 15 | Discord permission split | scrapers/discord_scraper.py | Verify bot permissions for PIN_MESSAGES change | + +### P2 - Nice to Have / New Sources +| # | Issue | File | Action | +|---|-------|------|--------| +| 14 | World Bank timeouts | collectors/world_bank.py | Add retry with backoff, increase timeout | +| 15 | SEBI slow pages | collectors/sebi_circulars.py | Increase timeout to 60s | +| 16 | Docker health checks | docker-compose.yml | Add checks for beat/nlp-worker | +| 17 | SEC data.sec.gov API | scrapers/sec_scraper.py | Add as supplementary data source | +| 18 | Breeze API integration | NEW collector | Free NSE/BSE data — evaluate as backup source | +| 19 | Upstox API check | N/A | Verify if free access extended past March 31, 2026 | +| 20 | RBI DPIP monitoring | N/A | Watch for public data access when platform launches | + +--- + +*Report generated by automated weekly validation task.* +*Next validation: 2026-04-20* diff --git a/tests/test_cbb_quality.py b/tests/test_cbb_quality.py new file mode 100644 index 0000000..d17404d --- /dev/null +++ b/tests/test_cbb_quality.py @@ -0,0 +1,197 @@ +"""Offline tests for the CBB per-source quality validator.""" + +from datetime import datetime, timedelta, timezone + +import pytest + +from processors.cbb_quality import ( + _frequency_to_days, + _is_numeric, + _to_datetime, + run_quality_report, + validate_cn_indicators, + validate_comtrade, +) + + +# ── Helpers ───────────────────────────────────────────────────────── + +def _dt(days_offset: float = 0.0) -> datetime: + return datetime.now(timezone.utc) + timedelta(days=days_offset) + + +# ── Low-level helper tests ────────────────────────────────────────── + +def test_to_datetime_parses_iso_and_variants(): + assert _to_datetime("2024-03-15") == datetime(2024, 3, 15, tzinfo=timezone.utc) + assert _to_datetime("2024-03") == datetime(2024, 3, 1, tzinfo=timezone.utc) + assert _to_datetime("2024") == datetime(2024, 1, 1, tzinfo=timezone.utc) + assert _to_datetime("2024-03-15T12:00:00Z") == datetime( + 2024, 3, 15, 12, 0, tzinfo=timezone.utc + ) + assert _to_datetime(datetime(2024, 3, 15, tzinfo=timezone.utc)) == datetime( + 2024, 3, 15, tzinfo=timezone.utc + ) + assert _to_datetime("not-a-date") is None + + +def test_is_numeric_rejects_bools_and_strings(): + assert _is_numeric(1.5) is True + assert _is_numeric(42) is True + assert _is_numeric("1,234.5") is True + assert _is_numeric(True) is False + assert _is_numeric("n/a") is False + assert _is_numeric(float("nan")) is False + + +def test_frequency_to_days(): + assert _frequency_to_days("daily") == 1 + assert _frequency_to_days("weekly") == 7 + assert _frequency_to_days("monthly") == 30 + assert _frequency_to_days("annual") == 365 + assert _frequency_to_days("unknown") == 30 + assert _frequency_to_days(None) == 30 + + +# ── validate_comtrade tests ───────────────────────────────────────── + +def test_validate_comtrade_passes_fresh_rows(): + now = _dt() + rows = [ + { + "source": "comtrade_mirror", + "indicator": "trade_X_84", + "date": now.replace(day=1, hour=0, minute=0, second=0, microsecond=0), + "value": 1_000_000.0, + "collected_at": now, + } + ] + report = validate_comtrade(rows, now=now) + assert report["schema_valid"] is True + assert report["freshness_valid"] is True + assert report["bad_rows"] == 0 + + +def test_validate_comtrade_detects_stale_collected_at(): + now = _dt() + rows = [ + { + "source": "comtrade_mirror", + "indicator": "trade_X_84", + "date": now.replace(day=1), + "value": 1_000_000.0, + "collected_at": now - timedelta(days=10), + } + ] + report = validate_comtrade(rows, now=now) + assert report["freshness_valid"] is False + assert report["freshness_age_days"] > 7 + + +def test_validate_comtrade_reports_schema_errors(): + now = _dt() + rows = [ + {"source": "comtrade_mirror", "indicator": "trade_X_84", "date": now, "value": "bad"}, + {"source": "comtrade_mirror", "indicator": "trade_M_85", "date": "not-a-date", "value": 100.0}, + {"source": "comtrade_mirror", "indicator": "trade_M_85"}, # missing date/value + ] + report = validate_comtrade(rows, now=now) + assert report["schema_valid"] is False + assert report["bad_rows"] == 3 + assert any("not numeric" in e for e in report["schema_errors"]) + assert any("not parseable" in e for e in report["schema_errors"]) + + +def test_validate_comtrade_empty_is_trivially_ok(): + now = _dt() + report = validate_comtrade([], now=now) + assert report["schema_valid"] is True + assert report["freshness_valid"] is True + assert report["row_count"] == 0 + + +# ── validate_cn_indicators tests ──────────────────────────────────── + +def test_validate_cn_indicators_fresh_and_stale(): + now = _dt() + catalog = [ + {"key": "ccfi", "frequency": "weekly"}, + {"key": "bdi", "frequency": "daily"}, + ] + rows = [ + {"source": "cn_indicators", "indicator": "ccfi", "date": now, "value": 1000.0}, + { + "source": "cn_indicators", + "indicator": "bdi", + "date": now - timedelta(days=5), + "value": 1500.0, + }, + ] + report = validate_cn_indicators(rows, catalog=catalog, now=now) + assert report["schema_valid"] is True + assert report["indicators"]["ccfi"]["freshness_valid"] is True + assert report["indicators"]["bdi"]["freshness_valid"] is False + assert report["freshness_valid"] is False + assert report["indicators"]["ccfi"]["freshness_threshold_days"] == 14 + assert report["indicators"]["bdi"]["freshness_threshold_days"] == 2 + + +def test_validate_cn_indicators_missing_catalog(): + now = _dt() + rows = [ + {"source": "cn_indicators", "indicator": "new_indicator", "date": now, "value": 1.0}, + ] + report = validate_cn_indicators(rows, catalog=[], now=now) + assert report["indicators"]["new_indicator"]["catalog_missing"] is True + assert report["indicators"]["new_indicator"]["catalog_frequency"] == "unknown" + + +def test_validate_cn_indicators_schema_error(): + now = _dt() + catalog = [{"key": "ccfi", "frequency": "weekly"}] + rows = [ + {"source": "cn_indicators", "indicator": "ccfi", "date": now, "value": None}, + ] + report = validate_cn_indicators(rows, catalog=catalog, now=now) + assert report["schema_valid"] is False + assert report["bad_rows"] == 1 + + +# ── run_quality_report tests ──────────────────────────────────────── + +def test_run_quality_report_routes_and_overall_ok(): + now = _dt() + rows = [ + {"source": "comtrade_mirror", "indicator": "trade_X_84", "date": now, "value": 1.0}, + {"source": "cn_indicators", "indicator": "ccfi", "date": now, "value": 1.0}, + ] + report = run_quality_report(rows, now=now) + assert report["status"] == "ok" + assert report["row_counts"]["comtrade_mirror"] == 1 + assert report["row_counts"]["cn_indicators"] == 1 + assert "comtrade_mirror" in report["sources"] + assert "cn_indicators" in report["sources"] + + +def test_run_quality_report_overall_fail_on_schema_error(): + now = _dt() + rows = [ + {"source": "comtrade_mirror", "indicator": "trade_X_84", "date": now, "value": "bad"}, + ] + report = run_quality_report(rows, now=now) + assert report["status"] == "fail" + + +def test_run_quality_report_overall_degraded_on_stale(): + now = _dt() + rows = [ + { + "source": "comtrade_mirror", + "indicator": "trade_X_84", + "date": now, + "value": 1.0, + "collected_at": now - timedelta(days=10), + }, + ] + report = run_quality_report(rows, now=now) + assert report["status"] == "degraded" diff --git a/tests/test_cn_indicators.py b/tests/test_cn_indicators.py new file mode 100644 index 0000000..99a418b --- /dev/null +++ b/tests/test_cn_indicators.py @@ -0,0 +1,506 @@ +"""Unit tests for the cn_indicators collector. + +All network calls and DB dependencies are mocked; tests run offline. +""" + +import json +import math +from datetime import datetime, timezone +from unittest.mock import AsyncMock, MagicMock, patch + +import pandas as pd +import pytest + +from collectors.cn_indicators import ( + CNIndicatorsCollector, + _CUSTOM_PARSERS, + _parse_chinadata_series, + _parse_sse_freight, +) + + +# ── Helpers ───────────────────────────────────────────────────────── + +def _mock_response(status_code: int = 200, json_data=None, text: str = ""): + """Build an httpx-style async response mock. + + ``json()`` is synchronous on ``httpx.Response``, so a regular MagicMock is used. + """ + resp = AsyncMock() + resp.status_code = status_code + resp.json = MagicMock(return_value=json_data if json_data is not None else {}) + resp.text = text + return resp + + +def _http_client(responses: dict[str, AsyncMock]) -> AsyncMock: + """AsyncMock client with a keyed get() side effect.""" + async def _get(url: str, **kwargs): + return responses.get(url, _mock_response(status_code=404)) + + client = AsyncMock() + client.get = _get + client.post = AsyncMock(return_value=_mock_response(status_code=405)) + client.aclose = AsyncMock() + return client + + +# ── Custom parser unit tests ──────────────────────────────────────── + +def test_parse_sse_freight_composite_emits_current_and_prior(): + payload = { + "data": { + "currentDate": "2024-01-05", + "lastDate": "2023-12-29", + "lineDataList": [ + { + "dataItemTypeName": "CCFI_T", + "currentContent": 1000.5, + "lastContent": 990.0, + "properties": {"lineName_EN": "COMPOSITE INDEX"}, + }, + { + "dataItemTypeName": "ROUTE_EUROPE", + "currentContent": 1200.0, + "properties": {"lineName_EN": "EUROPE"}, + }, + ], + } + } + rows = _parse_sse_freight(payload) + assert len(rows) == 2 + assert rows[0] == {"date": "2024-01-05", "value": 1000.5, "line": "COMPOSITE"} + assert rows[1] == {"date": "2023-12-29", "value": 990.0, "line": "COMPOSITE"} + + +def test_parse_sse_freight_fallback_to_first_line(): + payload = { + "data": { + "currentDate": "2024-01-05", + "lineDataList": [ + { + "dataItemTypeName": "ROUTE_MED", + "currentContent": 850.0, + "properties": {"lineName_EN": "MEDITERRANEAN"}, + } + ], + } + } + rows = _parse_sse_freight(payload) + assert len(rows) == 1 + assert rows[0]["line"] == "ROUTE_MED" + + +def test_parse_sse_freight_empty_payload(): + assert _parse_sse_freight(None) == [] + assert _parse_sse_freight({}) == [] + assert _parse_sse_freight({"data": {"lineDataList": []}}) == [] + + +def test_parse_chinadata_series_exports_and_extra_metrics(): + payload = { + "data": { + "data": [ + {"date": "2024-01", "total": 100.0, "export": 60.0, "import": 40.0, "balance": 20.0}, + {"date": "2024-02", "total": 110.0, "export": 65.0, "import": 45.0, "balance": 20.0}, + ] + } + } + rows = _parse_chinadata_series(payload) + assert len(rows) == 2 + assert rows[0]["date"] == "2024-01" + assert rows[0]["value"] == 60.0 + assert rows[0]["export"] == 60.0 + assert rows[0]["total"] == 100.0 + + +def test_parse_chinadata_series_uses_value_key(): + payload = { + "data": { + "data": [ + {"date": "2024-01", "total": 100.0, "export": 60.0}, + ] + } + } + rows = _parse_chinadata_series(payload, value_key="total") + assert rows[0]["value"] == 100.0 + + +def test_parse_chinadata_series_empty_and_malformed_rows(): + assert _parse_chinadata_series(None) == [] + assert _parse_chinadata_series({"data": {"data": []}}) == [] + assert _parse_chinadata_series({"data": {"data": ["not-a-dict"]}}) == [] + + +def test_custom_parsers_registry_keys(): + assert set(_CUSTOM_PARSERS.keys()) == {"ccfi", "scfi", "macro_customs"} + + +# ── Normalization unit tests ──────────────────────────────────────── + +@pytest.mark.parametrize( + "raw,expected", + [ + (None, None), + ("", None), + (" ", None), + (2024, datetime(2024, 1, 1, tzinfo=timezone.utc)), + (2024.0, datetime(2024, 1, 1, tzinfo=timezone.utc)), + # ISO date-only strings parse as *naive* datetimes; documented edge case. + ("2024-03-15", datetime(2024, 3, 15)), + ("2024-03", datetime(2024, 3, 1, tzinfo=timezone.utc)), + ("2024", datetime(2024, 1, 1, tzinfo=timezone.utc)), + ("2024/03/15", datetime(2024, 3, 15, tzinfo=timezone.utc)), + ("15-03-2024", datetime(2024, 3, 15, tzinfo=timezone.utc)), + ("2024-03-15T08:30:00Z", datetime(2024, 3, 15, 8, 30, tzinfo=timezone.utc)), + (datetime(2024, 3, 15, 8, 30), datetime(2024, 3, 15, 8, 30, tzinfo=timezone.utc)), + (datetime(2024, 3, 15, 8, 30, tzinfo=timezone.utc), datetime(2024, 3, 15, 8, 30, tzinfo=timezone.utc)), + ("not-a-date", None), + (float("inf"), None), + (True, None), # bool is int subclass; should be rejected by iso/int paths + ], +) +def test_normalize_date(raw, expected): + assert CNIndicatorsCollector._normalize_date(raw) == expected + + +@pytest.mark.parametrize( + "raw,expected", + [ + (None, None), + (True, None), + (False, None), + (42, 42.0), + (-3.5, -3.5), + ("1,234.56", 1234.56), + (" 78.9 ", 78.9), + ("", None), + (".", None), + ("-", None), + ("nd", None), + ("NA", None), + ("n/a", None), + ("null", None), + ("None", None), + (float("nan"), None), + (float("inf"), None), + ("abc", None), + ], +) +def test_normalize_value(raw, expected): + result = CNIndicatorsCollector._normalize_value(raw) + if expected is None: + assert result is None + elif math.isnan(expected): + assert result is None + else: + assert result == expected + + +# ── Nested path helper ────────────────────────────────────────────── + +@pytest.mark.parametrize( + "data,path,expected", + [ + ({"a": {"b": 1}}, "a.b", 1), + ({"a": [{"b": 2}]}, "a.0.b", 2), + ({"a": [10, 20]}, "a.1", 20), + ({"a": {"b": 1}}, "a.c", None), + ({"a": {"b": 1}}, "a.b.c", None), + ({"a": {"b": 1}}, "x", None), + ([{"a": 1}], "0.a", 1), + ({"a": {"b": 1}}, None, {"a": {"b": 1}}), + ({"a": {"b": 1}}, "", {"a": {"b": 1}}), + ], +) +def test_get_nested(data, path, expected): + assert CNIndicatorsCollector._get_nested(data, path) == expected + + +# ── Source normalization and catalog loading ──────────────────────── + +def test_normalize_source_maps_access_method(): + src = {"key": "k", "access_method": "todo"} + norm = CNIndicatorsCollector._normalize_source(src) + assert norm["access"] == "todo" + assert norm["method"] == "GET" + assert norm["parser"] == "json" + assert norm["date_field"] == "date" + assert norm["value_field"] == "value" + + +def test_load_catalog_reads_dict_and_list(tmp_path): + dict_catalog = tmp_path / "cn_hf_sources.json" + dict_catalog.write_text(json.dumps({"sources": [{"key": "x"}]})) + with patch("collectors.cn_indicators._CATALOG_PATH", dict_catalog): + assert CNIndicatorsCollector._load_catalog() == [{"key": "x"}] + + list_catalog = tmp_path / "list_sources.json" + list_catalog.write_text(json.dumps([{"key": "y"}])) + with patch("collectors.cn_indicators._CATALOG_PATH", list_catalog): + assert CNIndicatorsCollector._load_catalog() == [{"key": "y"}] + + +def test_load_catalog_missing_file_returns_empty(tmp_path): + missing = tmp_path / "does_not_exist.json" + with patch("collectors.cn_indicators._CATALOG_PATH", missing): + assert CNIndicatorsCollector._load_catalog() == [] + + +# ── Collection flow (async) ───────────────────────────────────────── + +@pytest.mark.asyncio +async def test_collect_skips_todo_sources(caplog): + caplog.set_level("INFO") + collector = CNIndicatorsCollector( + {"enabled_sources": [{"key": "bdi", "access": "todo", "note": "needs scraper"}]} + ) + collector._http = _http_client({}) + records = await collector.collect() + assert records == [] + assert "TODO: bdi" in caplog.text + + +@pytest.mark.asyncio +async def test_collect_open_json_with_nested_path(): + url = "https://api.worldbank.org/v2/country/CHN/indicator/NY.GDP.MKTP.CD" + collector = CNIndicatorsCollector( + { + "enabled_sources": [ + { + "key": "wb_chn_gdp", + "name_en": "World Bank China GDP", + "name_zh": "世界银行中国GDP", + "url": url, + "access": "open_json", + "parser": "json", + "json_path": "1", + "date_field": "date", + "value_field": "value", + "unit": "USD", + "sector": "macro", + "frequency": "annual", + } + ] + } + ) + collector._http = _http_client( + { + url: _mock_response( + json_data=[{"indicator": {"id": "NY.GDP.MKTP.CD"}}, [{"date": 2022, "value": 17963.2}, {"date": 2021, "value": 17734.1}]] + ) + } + ) + records = await collector.collect() + assert len(records) == 2 + assert records[0]["key"] == "wb_chn_gdp" + assert records[0]["value"] == 17963.2 + assert records[0]["date"] == datetime(2022, 1, 1, tzinfo=timezone.utc) + + +@pytest.mark.asyncio +async def test_collect_open_json_with_custom_parser(): + url = "https://en.sse.net.cn/currentIndex?indexName=ccfi" + collector = CNIndicatorsCollector( + { + "enabled_sources": [ + { + "key": "ccfi", + "name_en": "CCFI", + "name_zh": "中国出口集装箱运价指数", + "url": url, + "access": "open_json", + "parser": "json", + "date_field": "date", + "value_field": "value", + "unit": "points", + "sector": "transport_logistics", + "frequency": "weekly", + } + ] + } + ) + collector._http = _http_client( + { + url: _mock_response( + json_data={ + "data": { + "currentDate": "2024-01-05", + "lastDate": "2023-12-29", + "lineDataList": [ + { + "dataItemTypeName": "CCFI_T", + "currentContent": 1000.0, + "lastContent": 990.0, + "properties": {"lineName_EN": "COMPOSITE INDEX"}, + } + ], + } + } + ) + } + ) + records = await collector.collect() + assert len(records) == 2 + assert {r["date"] for r in records} == {datetime(2024, 1, 5), datetime(2023, 12, 29)} + assert all(r["key"] == "ccfi" for r in records) + + +@pytest.mark.asyncio +async def test_collect_csv_parser(): + url = "https://example.com/data.csv" + collector = CNIndicatorsCollector( + { + "enabled_sources": [ + { + "key": "csv_demo", + "name_en": "CSV Demo", + "name_zh": "CSV演示", + "url": url, + "access": "open_csv", + "parser": "csv", + "date_field": "date", + "value_field": "value", + "unit": "", + "sector": "macro", + "frequency": "daily", + } + ] + } + ) + collector._http = _http_client({url: _mock_response(text="date,value\n2024-03-15,123.45\n2024-03-16,130.00")}) + records = await collector.collect() + assert len(records) == 2 + # Date-only CSV strings are parsed as naive datetimes by the current pipeline. + assert records[0]["date"] == datetime(2024, 3, 15) + assert records[0]["value"] == 123.45 + + +@pytest.mark.asyncio +async def test_collect_non_200_returns_empty(caplog): + url = "https://example.com/bad" + collector = CNIndicatorsCollector( + { + "enabled_sources": [ + { + "key": "bad_source", + "name_en": "Bad Source", + "name_zh": "坏源", + "url": url, + "access": "open_json", + "parser": "json", + "date_field": "date", + "value_field": "value", + "unit": "", + "sector": "macro", + "frequency": "daily", + } + ] + } + ) + collector._http = _http_client({url: _mock_response(status_code=500)}) + records = await collector.collect() + assert records == [] + assert "non-200 status 500" in caplog.text + + +@pytest.mark.asyncio +async def test_collect_fetch_exception_is_graceful(caplog): + collector = CNIndicatorsCollector( + { + "enabled_sources": [ + { + "key": "explode", + "name_en": "Explode", + "name_zh": "爆炸", + "url": "https://example.com/x", + "access": "open_json", + "parser": "json", + "date_field": "date", + "value_field": "value", + "unit": "", + "sector": "macro", + "frequency": "daily", + } + ] + } + ) + + async def boom(*args, **kwargs): + raise RuntimeError("network down") + + client = AsyncMock() + client.get = boom + client.aclose = AsyncMock() + collector._http = client + records = await collector.collect() + assert records == [] + assert "fetch/parse failed" in caplog.text + + +@pytest.mark.asyncio +async def test_collect_ignores_non_dict_observations(): + url = "https://example.com/list" + collector = CNIndicatorsCollector( + { + "enabled_sources": [ + { + "key": "listy", + "name_en": "Listy", + "name_zh": "列表", + "url": url, + "access": "open_json", + "parser": "json", + "date_field": "date", + "value_field": "value", + "unit": "", + "sector": "macro", + "frequency": "daily", + } + ] + } + ) + collector._http = _http_client( + {url: _mock_response(json_data=[{"date": "2024-01-01", "value": 1}, "bad", {"date": "2024-01-02", "value": None}])} + ) + records = await collector.collect() + assert len(records) == 1 + assert records[0]["date"] == datetime(2024, 1, 1) + + +# ── Parse / validate ──────────────────────────────────────────────── + +@pytest.mark.asyncio +async def test_parse_shapes_dataframe(): + raw = [ + { + "key": "ccfi", + "date": datetime(2024, 1, 5, tzinfo=timezone.utc), + "value": 1000.0, + "unit": "points", + "sector": "transport_logistics", + "frequency": "weekly", + "source_name_zh": "中国出口集装箱运价指数", + "source_name_en": "CCFI", + "url": "https://en.sse.net.cn/currentIndex?indexName=ccfi", + "access": "open_json", + "metadata_extra": {"line": "COMPOSITE"}, + } + ] + collector = CNIndicatorsCollector({"enabled_sources": []}) + df = await collector.parse(raw) + assert list(df.columns) == ["indicator", "date", "value", "unit", "metadata"] + assert df.iloc[0]["indicator"] == "ccfi" + assert df.iloc[0]["metadata"]["sector"] == "transport_logistics" + + +def test_validate_requires_columns(): + collector = CNIndicatorsCollector({"enabled_sources": []}) + good = pd.DataFrame({"indicator": ["x"], "date": [datetime.now(timezone.utc)], "value": [1.0]}) + assert collector.validate(good) is True + + bad = pd.DataFrame({"indicator": ["x"], "value": [1.0]}) + from core.exceptions import SchemaChangedError + + with pytest.raises(SchemaChangedError): + collector.validate(bad) diff --git a/tests/test_comtrade_mirror.py b/tests/test_comtrade_mirror.py new file mode 100644 index 0000000..1a809d4 --- /dev/null +++ b/tests/test_comtrade_mirror.py @@ -0,0 +1,335 @@ +"""Unit tests for the UN Comtrade mirror collector. + +All network calls are mocked; tests run offline. +""" + +import math +from datetime import datetime, timezone +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from collectors.comtrade_mirror import ComtradeMirrorCollector +from core.exceptions import RateLimitError + + +# ── Inline fixtures / helpers ──────────────────────────────────────── + + +def _make_collector(config=None): + """Return a collector with an AsyncClient mocked at the base class.""" + cfg = config or {} + with patch("core.base_collector.httpx.AsyncClient") as mock_client: + mock_client.return_value = MagicMock(is_closed=False) + collector = ComtradeMirrorCollector(cfg) + # Tests that call _fetch directly assign a fresh mock _http below. + return collector + + +def _raw_record( + *, + flow="M", + period=202401, + cmd_code="8412", + primary_value=1234.5, + net_weight=100.0, + reporter_code=156, + partner_code=0, + mirror_reporter=None, + original_flow=None, +): + rec = { + "flowCode": flow, + "period": period, + "cmdCode": cmd_code, + "primaryValue": primary_value, + "netWgt": net_weight, + "reporterCode": reporter_code, + "partnerCode": partner_code, + } + if mirror_reporter: + rec["_mirror_reporter"] = mirror_reporter + if original_flow: + rec["_original_flow"] = original_flow + return rec + + +def _async_http(responses): + """Build a mock AsyncClient whose .get() returns responses in order. + + `responses` is an iterable of (status_code, json_body, headers_dict) + triples. `json_body` may be None to simulate a non-JSON response. + """ + seq = list(responses) + calls = {"count": 0} + + async def _get(url, params=None, headers=None): + idx = calls["count"] + calls["count"] += 1 + status, body, hdrs = seq[idx] if idx < len(seq) else (200, None, {}) + resp = MagicMock() + resp.status_code = status + resp.url = url + resp.headers = hdrs or {} + if body is None: + resp.json = MagicMock(side_effect=ValueError("not json")) + else: + resp.json = MagicMock(return_value=body) + return resp + + http = MagicMock() + http.get = AsyncMock(side_effect=_get) + http.aclose = AsyncMock() + return http + + +# ── parse() and _parse_record ──────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_parse_row_shape_reported(): + collector = _make_collector() + raw = [_raw_record(flow="M", period=202403, cmd_code="8501", primary_value=999.0)] + df = await collector.parse(raw) + + assert len(df) == 1 + row = df.iloc[0] + assert row["indicator"] == "trade_M_85" + assert row["date"] == datetime(2024, 3, 1, tzinfo=timezone.utc) + assert math.isclose(row["value"], 999.0) + assert row["unit"] == "USD" + + meta = row["metadata"] + assert meta["hs"] == "85" + assert meta["flow"] == "M" + assert meta["reporter"] == 156 + assert meta["partner"] == 0 + assert meta["period"] == "202403" + assert meta["view"] == "reported" + + +@pytest.mark.asyncio +async def test_parse_mirror_flow_inversion(): + collector = _make_collector() + # Partner reports an import from China (flow=M); we store it as China export. + raw = [ + _raw_record( + flow="M", + period=202402, + cmd_code="7308", + primary_value=5000.0, + reporter_code=842, + partner_code=156, + ) + ] + df = await collector.parse(raw) + + assert len(df) == 1 + row = df.iloc[0] + assert row["indicator"] == "trade_X_73_mirror" + assert row["metadata"]["flow"] == "X" + assert row["metadata"]["original_flow"] == "M" + assert row["metadata"]["view"] == "mirror" + assert row["metadata"]["reporter"] == 0 + + +@pytest.mark.asyncio +async def test_parse_aggregates_sub_commodity_rows(): + """Sub-commodity rows (HS6) under the same HS2 chapter are aggregated.""" + collector = _make_collector() + raw = [ + _raw_record(period=202401, cmd_code="8412", primary_value=100.0, net_weight=10.0), + _raw_record(period=202401, cmd_code="8499", primary_value=200.0, net_weight=20.0), + ] + df = await collector.parse(raw) + + assert len(df) == 1 + row = df.iloc[0] + assert row["indicator"] == "trade_M_84" + assert math.isclose(row["value"], 300.0) + assert math.isclose(row["metadata"]["netWeight"], 30.0) + + +@pytest.mark.asyncio +async def test_parse_skips_malformed_records(): + collector = _make_collector() + raw = [ + _raw_record(period=202401, cmd_code="8412", primary_value=100.0), + {"flowCode": "M", "period": "bad", "cmdCode": "8412", "primaryValue": 200.0}, + _raw_record(period=202401, cmd_code="8412", primary_value="nope"), + {}, + ] + df = await collector.parse(raw) + assert len(df) == 1 + assert math.isclose(df.iloc[0]["value"], 100.0) + + +@pytest.mark.asyncio +async def test_parse_empty_input(): + collector = _make_collector() + df = await collector.parse([]) + assert df.empty + + +# ── _fetch error handling ──────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_fetch_returns_dataset_on_success(): + collector = _make_collector() + collector._http = _async_http([(200, {"dataset": [{"id": 1}]}, {})]) + + result = await collector._fetch("http://example.com", {}, {}) + assert result == [{"id": 1}] + + +@pytest.mark.asyncio +async def test_fetch_handles_http_error(): + collector = _make_collector() + collector._http = _async_http([(500, None, {})]) + + result = await collector._fetch("http://example.com", {}, {}) + assert result == [] + + +@pytest.mark.asyncio +async def test_fetch_handles_non_json_response(): + collector = _make_collector() + collector._http = _async_http([(200, None, {})]) + + result = await collector._fetch("http://example.com", {}, {}) + assert result == [] + + +@pytest.mark.asyncio +async def test_fetch_raises_rate_limit_on_429(): + collector = _make_collector() + collector._http = _async_http([(429, {"error": "too many"}, {"Retry-After": "30"})]) + + with pytest.raises(RateLimitError) as exc: + await collector._fetch("http://example.com", {}, {}) + assert exc.value.retry_after == 30 + + +@pytest.mark.asyncio +async def test_fetch_graceful_on_request_exception(): + collector = _make_collector() + collector._http = MagicMock() + collector._http.get = AsyncMock(side_effect=ConnectionError("no route")) + + result = await collector._fetch("http://example.com", {}, {}) + assert result == [] + + +# ── collect() integration ──────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_collect_gathers_reported_and_mirror_records(): + """collect() should assemble records from both the reported and mirror loops.""" + collector = _make_collector({"recent_months": 1, "partner_reporters": [842]}) + + reported_payload = { + "dataset": [ + _raw_record(flow="M", period="202401", cmd_code="8412", primary_value=100.0), + ] + } + mirror_payload = { + "dataset": [ + _raw_record( + flow="M", + period="202401", + cmd_code="8412", + primary_value=50.0, + reporter_code=842, + partner_code=156, + ), + ] + } + # 2 flows * 1 period = 2 reported calls, then 2 flows * 1 period * 1 partner = 2 mirror calls. + collector._http = _async_http( + [ + (200, reported_payload, {}), + (200, reported_payload, {}), + (200, mirror_payload, {}), + (200, mirror_payload, {}), + ] + ) + + records = await collector.collect() + # Each mirrored record is tagged, not duplicated; we just check non-empty. + assert len(records) == 4 + mirror_records = [r for r in records if r.get("_mirror_reporter")] + assert len(mirror_records) == 2 + + +@pytest.mark.asyncio +async def test_collect_empty_responses_graceful(): + collector = _make_collector({"recent_months": 1, "partner_reporters": [842]}) + collector._http = _async_http([(200, {"dataset": []}, {})] * 4) + + records = await collector.collect() + assert records == [] + + +@pytest.mark.asyncio +async def test_collect_stops_on_rate_limit(): + collector = _make_collector({"recent_months": 2, "partner_reporters": [842]}) + collector._http = _async_http( + [ + (429, {"error": "rate"}, {"Retry-After": "10"}), + ] + ) + + records = await collector.collect() + assert records == [] + + +# ── validate() ─────────────────────────────────────────────────────── + + +def test_validate_empty_dataframe(): + import pandas as pd + + collector = _make_collector() + assert collector.validate(pd.DataFrame()) is True + + +def test_validate_missing_columns_raises(): + import pandas as pd + + collector = _make_collector() + with pytest.raises(Exception): + collector.validate(pd.DataFrame({"value": [1]})) + + +# ── Internal helpers ───────────────────────────────────────────────── + + +def test_period_to_date(): + collector = _make_collector() + assert collector._period_to_date(202405) == datetime( + 2024, 5, 1, tzinfo=timezone.utc + ) + + +def test_periods_shape(): + collector = _make_collector() + periods = collector._periods(3) + assert len(periods) == 3 + assert all(len(p) == 6 for p in periods) + + +def test_endpoint(): + collector = _make_collector() + url = collector._endpoint("202401", 156) + assert url.endswith("C/M/HS/202401/156") + + +def test_params_includes_desc(): + collector = _make_collector() + params = collector._params(flow="X", partner=0, cmd_code="84,85") + assert params["flowCode"] == "X" + assert params["partnerCode"] == "0" + assert params["cmdCode"] == "84,85" + assert params["includeDesc"] == "True" diff --git a/tests/test_conditions_index.py b/tests/test_conditions_index.py new file mode 100644 index 0000000..8fb8552 --- /dev/null +++ b/tests/test_conditions_index.py @@ -0,0 +1,371 @@ +"""Unit tests for processors/conditions_index.py. + +All DB and network dependencies are mocked; tests run offline. +""" + +import math +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from processors.conditions_index import ( + ConditionsIndexProcessor, + _latest_complete_month, + _load_taxonomy, + _norm_dt, + _period_str, + _prev_month, + compute_conditions, +) + + +SIMPLE_TAXONOMY = { + "sectors": { + "electronics": { + "hs_codes": ["85"], + "cn_hf_sources": ["bdi"], + "region": "coastal_export", + } + } +} + + +def _dt(*args, tz=timezone.utc): + return datetime(*args, tzinfo=tz) + + +# --------------------------------------------------------------------------- +# Date helpers +# --------------------------------------------------------------------------- + +def test_norm_dt_handles_naive_and_aware(): + naive = datetime(2024, 5, 15, 10, 0, 0) + assert _norm_dt(naive) == datetime(2024, 5, 15, 10, 0, 0, tzinfo=timezone.utc) + aware = datetime(2024, 5, 15, 10, 0, 0, tzinfo=timezone.utc) + assert _norm_dt(aware) == aware + assert _norm_dt(None) is None + + +def test_latest_complete_month_and_prev_month(): + # June 15 -> latest complete month is May + assert _latest_complete_month(_dt(2024, 6, 15)) == (2024, 5) + assert _prev_month(2024, 5) == (2024, 4) + assert _prev_month(2024, 1) == (2023, 12) + # June 1 at 00:00 is not complete (last instant of June not reached) + assert _latest_complete_month(_dt(2024, 6, 1)) == (2024, 5) + # July 1 00:00 -> June is complete + assert _latest_complete_month(_dt(2024, 7, 1)) == (2024, 6) + + +def test_period_str(): + assert _period_str(2024, 5) == "2024-05" + + +# --------------------------------------------------------------------------- +# Core diffusion math: SD / AS / D +# --------------------------------------------------------------------------- + +def test_diffusion_math_with_trade_anchor(): + """D = 0.4*SD + 0.6*AS when an anchor is available.""" + now = _dt(2024, 6, 15) + trade = [ + # April (previous) + {"date": _dt(2024, 4, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + # May (latest) - 10% growth + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 110.0, "reporter": 156, "partner": 0}, + ] + # 2 positive, 1 negative, 1 neutral -> SD = 100*(2-1)/4 = 25 + mentions = [ + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20}, + {"date": _dt(2024, 5, 2), "sector": "electronics", "score": 0.25}, + {"date": _dt(2024, 5, 3), "sector": "electronics", "score": -0.20}, + {"date": _dt(2024, 5, 4), "sector": "electronics", "score": 0.05}, + ] + results = compute_conditions(trade, [], mentions, SIMPLE_TAXONOMY, now) + assert len(results) == 1 + r = results[0] + expected_sd = 25.0 + expected_as = 100.0 * math.tanh(0.10 / 0.10) + expected_d = 0.4 * expected_sd + 0.6 * expected_as + assert r["SD"] == pytest.approx(expected_sd, 0.001) + assert r["AS"] == pytest.approx(expected_as, 0.001) + assert r["D"] == pytest.approx(expected_d, 0.001) + + +def test_diffusion_math_with_hf_anchor_fallback(): + """When trade anchor is missing, the processor falls back to CN HF indicators.""" + now = _dt(2024, 6, 15) + indicators = [ + {"date": _dt(2024, 4, 1), "indicator": "bdi", "value": 1000.0}, + {"date": _dt(2024, 5, 1), "indicator": "bdi", "value": 1100.0}, + ] + mentions = [ + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20}, + ] + results = compute_conditions([], indicators, mentions, SIMPLE_TAXONOMY, now) + r = results[0] + assert r["inputs"]["anchor_source"] == "cn_hf:bdi" + expected_as = 100.0 * math.tanh(0.10 / 0.10) + assert r["AS"] == pytest.approx(expected_as, 0.001) + + +# --------------------------------------------------------------------------- +# Momentum +# --------------------------------------------------------------------------- + +def test_momentum_is_latest_minus_previous(): + now = _dt(2024, 6, 15) + trade = [ + {"date": _dt(2024, 3, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 4, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 110.0, "reporter": 156, "partner": 0}, + ] + mentions = [ + {"date": _dt(2024, 4, 1), "sector": "electronics", "score": 0.0}, + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.0}, + ] + results = compute_conditions(trade, [], mentions, SIMPLE_TAXONOMY, now) + r = results[0] + assert r["momentum"] == pytest.approx(r["D"] - 0.0, 0.001) + + +def test_momentum_zero_when_unchanged(): + """When latest and previous diffusion are identical, momentum is zero.""" + now = _dt(2024, 6, 15) + trade = [ + {"date": _dt(2024, 3, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 4, 1), "flow": "X", "hs": "85", "value": 110.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 121.0, "reporter": 156, "partner": 0}, + ] + # Same sentiment composition for both months -> same SD, and same AS (same growth) + mentions = [ + {"date": _dt(2024, 4, 1), "sector": "electronics", "score": 0.20}, + {"date": _dt(2024, 4, 2), "sector": "electronics", "score": -0.20}, + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20}, + {"date": _dt(2024, 5, 2), "sector": "electronics", "score": -0.20}, + ] + results = compute_conditions(trade, [], mentions, SIMPLE_TAXONOMY, now) + r = results[0] + # Both months have 10% growth and identical sentiment -> D equal -> momentum 0 + assert r["momentum"] == pytest.approx(0.0, 0.001) + + +# --------------------------------------------------------------------------- +# Mirror gap +# --------------------------------------------------------------------------- + +def test_mirror_gap_calculation(): + now = _dt(2024, 6, 15) + trade = [ + # Reported exports + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + # Partner-reported imports from China (mirror flow) + {"date": _dt(2024, 5, 1), "flow": "M", "hs": "85", "value": 120.0, "reporter": 0, "partner": 156}, + ] + results = compute_conditions(trade, [], [], SIMPLE_TAXONOMY, now) + r = results[0] + assert r["mirror_gap"] == pytest.approx(20.0, 0.001) + + +def test_mirror_gap_none_when_missing(): + now = _dt(2024, 6, 15) + trade = [ + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + ] + results = compute_conditions(trade, [], [], SIMPLE_TAXONOMY, now) + assert results[0]["mirror_gap"] is None + + +# --------------------------------------------------------------------------- +# Confidence tiers +# --------------------------------------------------------------------------- + +def test_confidence_high_with_anchor_and_many_mentions(): + now = _dt(2024, 6, 15) + trade = [ + {"date": _dt(2024, 4, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 110.0, "reporter": 156, "partner": 0}, + ] + mentions = [ + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20} + for _ in range(30) + ] + results = compute_conditions(trade, [], mentions, SIMPLE_TAXONOMY, now) + assert results[0]["confidence"] == "high" + + +def test_confidence_med_with_anchor_but_few_mentions(): + now = _dt(2024, 6, 15) + trade = [ + {"date": _dt(2024, 4, 1), "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": 110.0, "reporter": 156, "partner": 0}, + ] + mentions = [ + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20} + for _ in range(5) + ] + results = compute_conditions(trade, [], mentions, SIMPLE_TAXONOMY, now) + assert results[0]["confidence"] == "med" + + +def test_confidence_med_without_anchor_but_many_mentions(): + now = _dt(2024, 6, 15) + mentions = [ + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20} + for _ in range(15) + ] + results = compute_conditions([], [], mentions, SIMPLE_TAXONOMY, now) + assert results[0]["confidence"] == "med" + + +def test_confidence_low_without_anchor_and_few_mentions(): + now = _dt(2024, 6, 15) + mentions = [ + {"date": _dt(2024, 5, 1), "sector": "electronics", "score": 0.20} + for _ in range(3) + ] + results = compute_conditions([], [], mentions, SIMPLE_TAXONOMY, now) + assert results[0]["confidence"] == "low" + + +# --------------------------------------------------------------------------- +# Empty / degenerate inputs +# --------------------------------------------------------------------------- + +def test_empty_taxonomy_returns_empty(): + now = _dt(2024, 6, 15) + assert compute_conditions([], [], [], {"sectors": {}}, now) == [] + + +def test_empty_inputs_with_taxonomy_returns_zeroed_sectors(): + now = _dt(2024, 6, 15) + results = compute_conditions([], [], [], SIMPLE_TAXONOMY, now) + assert len(results) == 1 + r = results[0] + assert r["D"] == 0.0 + assert r["SD"] == 0.0 + assert r["AS"] == 0.0 + assert r["momentum"] == 0.0 + assert r["mirror_gap"] is None + assert r["confidence"] == "low" + assert r["n_mentions"] == 0 + + +def test_missing_values_skipped_gracefully(): + now = _dt(2024, 6, 15) + trade = [ + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": None, "reporter": 156, "partner": 0}, + {"date": None, "flow": "X", "hs": "85", "value": 100.0, "reporter": 156, "partner": 0}, + {"date": _dt(2024, 5, 1), "flow": "X", "hs": "85", "value": "not-a-number", "reporter": 156, "partner": 0}, + ] + results = compute_conditions(trade, [], [], SIMPLE_TAXONOMY, now) + assert results[0]["n_mentions"] == 0 + assert results[0]["confidence"] == "low" + + +# --------------------------------------------------------------------------- +# Processor wiring (DB + Redis mocked) +# --------------------------------------------------------------------------- + +def test_conditions_index_processor_run_publishes_to_redis_and_persists_snapshot(): + now = _dt(2024, 6, 15) + fake_result = { + "sector": "electronics", + "region": "coastal_export", + "period": "2024-05", + "D": 10.0, + "SD": 5.0, + "AS": 12.5, + "momentum": 1.0, + "mirror_gap": None, + "confidence": "med", + "n_mentions": 15, + "inputs": {}, + } + + mock_redis = MagicMock() + mock_redis_module = MagicMock() + mock_redis_module.from_url.return_value = mock_redis + + mock_db = MagicMock() + mock_session_local = MagicMock(return_value=mock_db) + mock_api_database = MagicMock() + mock_api_database.SessionLocal = mock_session_local + + mock_snapshot_class = MagicMock() + mock_storage_models = MagicMock() + mock_storage_models.ConditionsIndexSnapshot = mock_snapshot_class + + with patch( + "processors.conditions_index._build_inputs_from_db", + return_value=([], [], [], SIMPLE_TAXONOMY), + ), patch( + "processors.conditions_index.compute_conditions", return_value=[fake_result] + ), patch( + "processors.conditions_index._latest_complete_month", return_value=(2024, 5) + ), patch.dict( + sys.modules, + { + "redis": mock_redis_module, + "api.database": mock_api_database, + "storage.models": mock_storage_models, + }, + ): + proc = ConditionsIndexProcessor() + result = proc.run() + + assert result["status"] == "success" + assert result["sectors"] == 1 + assert result["period"] == "2024-05" + mock_redis.set.assert_called_once() + mock_redis.close.assert_called_once() + mock_db.add.assert_called_once() + mock_db.commit.assert_called_once() + mock_db.close.assert_called_once() + + +def test_conditions_index_processor_run_survives_compute_error(): + with patch( + "processors.conditions_index._build_inputs_from_db", + return_value=([], [], [], SIMPLE_TAXONOMY), + ), patch( + "processors.conditions_index.compute_conditions", + side_effect=ValueError("boom"), + ): + proc = ConditionsIndexProcessor() + result = proc.run() + assert result["status"] == "error" + assert "boom" in result["error"] + + +def test_process_one_returns_use_run(): + proc = ConditionsIndexProcessor() + assert proc.process_one({}) == {"status": "use_run"} + + +# --------------------------------------------------------------------------- +# Offline self-test path +# --------------------------------------------------------------------------- + +def test_offline_self_test_path_runs_cleanly(): + """Execute the module's __main__ block via subprocess and verify output.""" + proc = subprocess.run( + [sys.executable, "-m", "processors.conditions_index"], + cwd=Path(__file__).resolve().parent.parent, + capture_output=True, + text=True, + ) + assert proc.returncode == 0, proc.stderr + assert "China Economic Conditions Index (offline self-test)" in proc.stdout + # All three sectors should be printed. + assert "electronics" in proc.stdout + assert "autos" in proc.stdout + assert "steel" in proc.stdout + + +def test_load_taxonomy_missing_path_returns_skeleton(): + assert _load_taxonomy(Path("/does/not/exist.json")) == {"sectors": {}} diff --git a/tests/test_conditions_report.py b/tests/test_conditions_report.py new file mode 100644 index 0000000..682418a --- /dev/null +++ b/tests/test_conditions_report.py @@ -0,0 +1,370 @@ +"""Unit tests for processors/conditions_report.py. + +All network and database dependencies are mocked so tests run offline. +""" + +import json +import sys +from datetime import date, datetime, timezone +from types import ModuleType +from unittest.mock import MagicMock, patch + +import pytest + +from processors.conditions_report import ConditionsReportGenerator + + +def _async_return(value): + """Return a callable that yields *value* when awaited via asyncio.run().""" + async def _coro(**kwargs): + return value + return _coro + + +# Inject lightweight stand-ins for optional dependencies that are imported +# inside methods, so patches below resolve without installing them. +if "free_llm_router" not in sys.modules: + sys.modules["free_llm_router"] = ModuleType("free_llm_router") +sys.modules["free_llm_router"].FreeLLMRouter = MagicMock() + +if "redis" not in sys.modules: + sys.modules["redis"] = ModuleType("redis") +sys.modules["redis"].from_url = MagicMock() + +if "anthropic" not in sys.modules: + sys.modules["anthropic"] = ModuleType("anthropic") +sys.modules["anthropic"].Anthropic = MagicMock() + + +@pytest.fixture +def gen() -> ConditionsReportGenerator: + return ConditionsReportGenerator({"llm_model": "claude-test", "ollama_model": "phi-test"}) + + +@pytest.fixture +def sample_sectors(): + return [ + { + "sector": "electronics_machinery", + "region": "coastal_export", + "period": "2024-05", + "D": 18.5, + "SD": 12.0, + "AS": 22.0, + "momentum": 4.2, + "mirror_gap": -8.3, + "confidence": "high", + "n_mentions": 42, + "inputs": { + "reported_value": 120_000_000_000.0, + "mirror_value": 110_000_000_000.0, + "anchor_source": "trade", + }, + }, + { + "sector": "property_construction", + "region": "national", + "period": "2024-05", + "D": -22.1, + "SD": -18.5, + "AS": -25.0, + "momentum": -6.7, + "mirror_gap": None, + "confidence": "med", + "n_mentions": 18, + "inputs": {"anchor_source": "cn_hf:bdi"}, + }, + ] + + +class TestBuildPrompt: + """Context-assembly and formatting helpers.""" + + def test_prompt_includes_generated_at_and_sector_count(self, gen, sample_sectors): + prompt = gen._build_prompt(sample_sectors, "2024-05-31T12:00:00Z") + assert "2024-05-31T12:00:00Z" in prompt + assert "Sectors covered: 2" in prompt + + def test_prompt_includes_sector_data(self, gen, sample_sectors): + prompt = gen._build_prompt(sample_sectors, None) + assert "Sector: electronics_machinery" in prompt + assert "Diffusion D=18.50" in prompt + assert "Mirror gap=-8.30%" in prompt + + def test_prompt_omits_mirror_when_none(self, gen, sample_sectors): + prompt = gen._build_prompt(sample_sectors, None) + assert "Sector: property_construction" in prompt + # The None sector should not render a Mirror gap line. + prop_tail = prompt.split("Sector: property_construction")[1] + next_sector = prop_tail.split("Sector:")[0] + assert "Mirror gap" not in next_sector + + def test_prompt_handles_empty_sectors(self, gen): + prompt = gen._build_prompt([], None) + assert "Sectors covered: 0" in prompt + assert "--- Sector data ---" in prompt + + +class TestGenerateReport: + """LLM routing and fallback report generation.""" + + def test_free_llm_router_success(self, gen, sample_sectors): + with patch("free_llm_router.FreeLLMRouter") as mock_router: + mock_router.return_value.chat_completion = _async_return({"text": "LLM report body"}) + prompt = gen._build_prompt(sample_sectors, None) + report = gen._generate_report(prompt, sample_sectors, None) + assert report == "LLM report body" + + def test_free_llm_router_empty_falls_back(self, gen, sample_sectors): + with patch("free_llm_router.FreeLLMRouter") as mock_router: + mock_router.return_value.chat_completion = _async_return({"text": ""}) + # No Anthropic key, and mock Ollama failure so rule-based stub runs. + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": ""}, clear=False): + with patch("processors.conditions_report.httpx.post", side_effect=Exception("down")): + prompt = gen._build_prompt(sample_sectors, None) + report = gen._generate_report(prompt, sample_sectors, "2024-05-31") + assert "# China Economic Conditions Briefing" in report + assert "electronics_machinery" in report + + def test_anthropic_success(self, gen, sample_sectors): + mock_msg = MagicMock() + mock_msg.content = [MagicMock(text="Claude report")] + with patch("free_llm_router.FreeLLMRouter") as mock_router: + mock_router.side_effect = Exception("no router") + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "test-key"}, clear=False): + with patch("anthropic.Anthropic") as mock_anthropic_cls: + mock_anthropic_cls.return_value.messages.create.return_value = mock_msg + prompt = gen._build_prompt(sample_sectors, None) + report = gen._generate_report(prompt, sample_sectors, None) + assert report == "Claude report" + + def test_ollama_success(self, gen, sample_sectors): + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"response": "Ollama report"} + with patch("free_llm_router.FreeLLMRouter") as mock_router: + mock_router.side_effect = Exception("no router") + with patch.dict("os.environ", {"ANTHROPIC_API_KEY": ""}, clear=False): + with patch("processors.conditions_report.httpx.post", return_value=mock_resp): + prompt = gen._build_prompt(sample_sectors, None) + report = gen._generate_report(prompt, sample_sectors, None) + assert report == "Ollama report" + + +class TestRuleBasedReport: + """Deterministic fallback report formatting.""" + + def test_rule_based_header(self, gen, sample_sectors): + report = gen._rule_based_report(sample_sectors, "2024-05-31T12:00:00Z") + assert "# China Economic Conditions Briefing" in report + assert "2024-05-31T12:00:00Z" in report + + def test_rule_based_sorts_by_absolute_momentum(self, gen): + sectors = [ + {"sector": "a", "momentum": 1.0, "D": 0.0, "confidence": "low"}, + {"sector": "b", "momentum": -9.0, "D": 5.0, "confidence": "med"}, + {"sector": "c", "momentum": 3.0, "D": -2.0, "confidence": "high"}, + ] + report = gen._rule_based_report(sectors, None) + # Biggest mover should be b because abs(-9) is largest. + movers = report.split("## Biggest movers")[1] + first_mover = movers.strip().splitlines()[0] + assert "b" in first_mover + assert "momentum -9.00" in first_mover + + def test_rule_based_arrows(self, gen): + sectors = [ + {"sector": "up", "momentum": 1.0, "D": 0.0, "confidence": "low"}, + {"sector": "down", "momentum": -1.0, "D": 0.0, "confidence": "low"}, + {"sector": "flat", "momentum": 0.0, "D": 0.0, "confidence": "low"}, + ] + report = gen._rule_based_report(sectors, None) + assert "▲ improving" in report + assert "▼ weakening" in report + assert "▬ stable" in report + + def test_rule_based_empty_sectors(self, gen): + report = gen._rule_based_report([], None) + assert "# China Economic Conditions Briefing" in report + assert "## Biggest movers" not in report + assert "Cross-source triangulation" in report + + +class TestWriteReport: + """Report file I/O helpers.""" + + def test_write_report_creates_dated_and_latest_files(self, gen, tmp_path, sample_sectors): + with patch("processors.conditions_report._REPORT_DIR", tmp_path): + today = date(2024, 5, 31) + report_text = "# Test report" + report_path, latest_path = gen._write_report(report_text, today) + assert report_path.exists() + assert latest_path.exists() + assert report_path.read_text(encoding="utf-8") == report_text + assert latest_path.read_text(encoding="utf-8") == report_text + assert report_path.name == "2024-05-31.md" + assert latest_path.name == "latest.md" + + +class TestEscapeMarkdown: + """Telegram MarkdownV2 escaping.""" + + @pytest.mark.parametrize( + "text,expected", + [ + ("hello_world", "hello\\_world"), + ("**bold**", "\\*\\*bold\\*\\*"), + ("# title", "\\# title"), + ("a | b", "a \\| b"), + ], + ) + def test_escape_markdown(self, gen, text, expected): + assert gen._escape_markdown(text) == expected + + +class TestSendTelegram: + """Telegram notification helper.""" + + def test_send_telegram_skips_without_config(self, gen): + with patch.dict("os.environ", {}, clear=True): + with patch("processors.conditions_report.httpx.post") as mock_post: + gen._send_telegram("report") + mock_post.assert_not_called() + + def test_send_telegram_posts_with_config(self, gen): + env = {"TELEGRAM_BOT_TOKEN": "bot123", "TELEGRAM_ALERT_CHAT_ID": "chat456"} + with patch.dict("os.environ", env, clear=False): + with patch("processors.conditions_report.httpx.post") as mock_post: + gen._send_telegram("# Report") + mock_post.assert_called_once() + args, kwargs = mock_post.call_args + assert "bot123/sendMessage" in args[0] + assert kwargs["json"]["chat_id"] == "chat456" + assert "parse_mode" in kwargs["json"] + + +class TestLoadLatestIndex: + """Redis / DB fallback for loading the latest conditions index.""" + + def test_load_latest_index_from_redis(self, gen): + payload = { + "generated_at": "2024-05-31T12:00:00Z", + "sectors": [{"sector": "steel", "D": 5.0}], + } + mock_redis = MagicMock() + mock_redis.get.return_value = json.dumps(payload) + with patch("redis.from_url", return_value=mock_redis): + result = gen._load_latest_index(MagicMock()) + assert result["generated_at"] == "2024-05-31T12:00:00Z" + assert result["sectors"][0]["sector"] == "steel" + mock_redis.close.assert_called_once() + + def test_load_latest_index_redis_empty_falls_back_to_db(self, gen): + mock_redis = MagicMock() + mock_redis.get.return_value = None + + row = MagicMock() + row.sector = "steel" + row.region = "national" + row.period = "2024-05" + row.diffusion = 5.0 + row.sentiment = 2.0 + row.anchor = 8.0 + row.momentum = 1.0 + row.mirror_gap = -3.0 + row.confidence = "med" + row.n_mentions = 7 + row.inputs = {"reported_value": 100} + row.generated_at = datetime(2024, 5, 31, 12, 0, 0, tzinfo=timezone.utc) + + mock_db = MagicMock() + mock_db.query.return_value.filter.return_value.order_by.return_value.all.return_value = [row] + + with patch("redis.from_url", return_value=mock_redis): + result = gen._load_latest_index(mock_db) + assert result["generated_at"] == "2024-05-31T12:00:00+00:00" + assert len(result["sectors"]) == 1 + sector = result["sectors"][0] + assert sector["sector"] == "steel" + assert sector["D"] == 5.0 + assert sector["mirror_gap"] == -3.0 + + def test_load_latest_index_returns_empty_when_no_data(self, gen): + mock_redis = MagicMock() + mock_redis.get.return_value = None + + mock_db = MagicMock() + mock_db.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] + + with patch("redis.from_url", return_value=mock_redis): + result = gen._load_latest_index(mock_db) + assert result == {} + + +class TestRun: + """End-to-end run() orchestration with mocked dependencies.""" + + def test_run_success(self, gen, sample_sectors, tmp_path): + payload = { + "generated_at": "2024-05-31T12:00:00Z", + "sectors": sample_sectors, + } + mock_redis = MagicMock() + mock_redis.get.return_value = json.dumps(payload) + + mock_db = MagicMock() + mock_session_local = MagicMock(return_value=mock_db) + + mock_digest_cls = MagicMock() + + with patch("redis.from_url", return_value=mock_redis): + with patch("api.database.SessionLocal", mock_session_local): + with patch("storage.models.DailyDigest", mock_digest_cls): + with patch("free_llm_router.FreeLLMRouter") as mock_router: + mock_router.return_value.chat_completion = _async_return( + {"text": "LLM generated report"} + ) + with patch("processors.conditions_report._REPORT_DIR", tmp_path): + result = gen.run() + + assert result["status"] == "success" + assert result["sectors"] == 2 + assert result["report_length"] == len("LLM generated report") + mock_db.add.assert_called_once() + mock_db.commit.assert_called_once() + mock_db.close.assert_called_once() + + def test_run_no_data_returns_no_data(self, gen): + mock_redis = MagicMock() + mock_redis.get.return_value = None + mock_db = MagicMock() + mock_db.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] + mock_session_local = MagicMock(return_value=mock_db) + + with patch("redis.from_url", return_value=mock_redis): + with patch("api.database.SessionLocal", mock_session_local): + result = gen.run() + + assert result["status"] == "no_data" + mock_db.close.assert_called_once() + + def test_run_error_rolls_back_and_returns_error(self, gen, sample_sectors): + payload = { + "generated_at": "2024-05-31T12:00:00Z", + "sectors": sample_sectors, + } + mock_redis = MagicMock() + mock_redis.get.return_value = json.dumps(payload) + mock_db = MagicMock() + mock_session_local = MagicMock(return_value=mock_db) + + with patch("redis.from_url", return_value=mock_redis): + with patch("api.database.SessionLocal", mock_session_local): + # Force an exception during prompt building. + with patch.object(gen, "_build_prompt", side_effect=ValueError("boom")): + result = gen.run() + + assert result["status"] == "error" + assert "boom" in result["error"] + mock_db.rollback.assert_called_once() + mock_db.close.assert_called_once() diff --git a/tests/test_zh_finance.py b/tests/test_zh_finance.py new file mode 100644 index 0000000..765810a --- /dev/null +++ b/tests/test_zh_finance.py @@ -0,0 +1,159 @@ +"""Tests for processors.zh_finance. + +Covers: +- lexicon loading (cache, missing file, invalid JSON) +- detect_chinese_policy_and_sectors: negation flips, substring matching, + sector counts, threshold logic, and empty inputs. +""" + +import json +import logging +from pathlib import Path +from unittest.mock import patch + +import pytest + +from processors.zh_finance import detect_chinese_policy_and_sectors, load_lexicon + + +@pytest.fixture +def lexicon(): + """Small inline lexicon for deterministic tests.""" + return { + "hawkish_keywords": ["加息", "加息周期", "收紧", "紧缩"], + "dovish_keywords": ["降息", "降准", "宽松", "刺激"], + "sector_keywords": { + "banking": ["银行", "银行股"], + "markets": ["股市", "大盘"], + "tech": ["科技", "AI"], + }, + } + + +class TestLoadLexicon: + def test_loads_valid_lexicon(self, tmp_path, caplog): + """load_lexicon returns the parsed JSON and logs success.""" + lex_path = tmp_path / "zh_finance_lexicon.json" + payload = {"hawkish_keywords": ["加息"], "dovish_keywords": [], "sector_keywords": {}} + lex_path.write_text(json.dumps(payload), encoding="utf-8") + + with caplog.at_level(logging.INFO): + with patch("processors.zh_finance._LEXICON_PATH", lex_path): + load_lexicon.cache_clear() + result = load_lexicon() + + assert result == payload + assert "Loaded lexicon" in caplog.text + + def test_missing_file_returns_empty_dict(self, tmp_path, caplog): + """A missing lexicon file yields {} and a warning.""" + missing = tmp_path / "does_not_exist.json" + + with caplog.at_level(logging.WARNING): + with patch("processors.zh_finance._LEXICON_PATH", missing): + load_lexicon.cache_clear() + result = load_lexicon() + + assert result == {} + assert "Lexicon not found" in caplog.text + + def test_invalid_json_returns_empty_dict(self, tmp_path, caplog): + """Invalid JSON yields {} and an error.""" + bad = tmp_path / "bad.json" + bad.write_text("{not valid json", encoding="utf-8") + + with caplog.at_level(logging.ERROR): + with patch("processors.zh_finance._LEXICON_PATH", bad): + load_lexicon.cache_clear() + result = load_lexicon() + + assert result == {} + assert "invalid JSON" in caplog.text + + +class TestDetectChinesePolicyAndSectors: + def test_basic_hawkish(self, lexicon): + """Direct hawkish keywords above threshold are detected.""" + text = "加息周期,货币紧缩" + result = detect_chinese_policy_and_sectors(text, lexicon) + assert result["policy_direction"] == "hawkish" + + def test_basic_dovish(self, lexicon): + """Direct dovish keywords above threshold are detected.""" + text = "降准降息,流动性宽松" # "降息" matches dovish, "降准" matches, "宽松" matches + result = detect_chinese_policy_and_sectors(text, lexicon) + assert result["policy_direction"] == "dovish" + + def test_negation_flips_hawkish_to_dovish(self, lexicon): + """不加息 / 尚未加息 are not hawkish and instead boost dovish.""" + # Two negated hawkish signals give dovish_total >= 2. + text = "我们不加息,央行尚未加息" + result = detect_chinese_policy_and_sectors(text, lexicon) + assert result["policy_direction"] == "dovish" + assert result["sectors"] == {} + + def test_negation_flips_dovish_to_hawkish(self, lexicon): + """不降息 is not dovish and instead boosts hawkish.""" + text = "不会降息,难以宽松" + result = detect_chinese_policy_and_sectors(text, lexicon) + # Two negated dovish signals -> hawkish_total >= 2, dovish_total = 0. + assert result["policy_direction"] == "hawkish" + + def test_threshold_requires_two_net_hits(self, lexicon): + """A single non-negated keyword is not enough to declare a direction.""" + assert detect_chinese_policy_and_sectors("加息", lexicon)["policy_direction"] == "neutral" + assert detect_chinese_policy_and_sectors("降息", lexicon)["policy_direction"] == "neutral" + + def test_substring_matching_in_chinese(self, lexicon): + """Keywords match as substrings; no \b boundary semantics are used.""" + # "加息周期" contains "加息" as a substring. + text = "加息周期启动" + result = detect_chinese_policy_and_sectors(text, lexicon) + # "加息" and "加息周期" are both matched -> 2 hawkish hits. + assert result["policy_direction"] == "hawkish" + + def test_overlapping_keyword_matches_counted(self, lexicon): + """Overlapping sector keyword matches are each counted.""" + # "银行银行股" matches "银行" twice (positions 0 and 2) and "银行股" once (position 0). + text = "银行银行股上涨" + result = detect_chinese_policy_and_sectors(text, lexicon) + assert result["sectors"]["banking"]["mentions"] == 3 + + def test_sector_counts_multiple_sectors(self, lexicon): + """Multiple sectors can be detected and counted independently.""" + text = "银行、股市、科技板块全线走强" + result = detect_chinese_policy_and_sectors(text, lexicon) + assert result["sectors"] == { + "banking": {"mentions": 1}, + "markets": {"mentions": 1}, + "tech": {"mentions": 1}, + } + + def test_empty_text_returns_neutral(self, lexicon): + """Empty/whitespace input is neutral with no sectors.""" + assert detect_chinese_policy_and_sectors("", lexicon) == { + "policy_direction": "neutral", + "sectors": {}, + } + assert detect_chinese_policy_and_sectors(" ", lexicon) == { + "policy_direction": "neutral", + "sectors": {}, + } + + def test_empty_keyword_is_ignored(self, lexicon): + """Empty-string keywords must not cause spurious matches.""" + broken = { + "hawkish_keywords": ["加息", ""], + "dovish_keywords": ["降息", ""], + "sector_keywords": {"banking": ["", "银行"]}, + } + text = "银行降息" + result = detect_chinese_policy_and_sectors(text, broken) + assert result["sectors"]["banking"]["mentions"] == 1 + + def test_tie_returns_neutral(self, lexicon): + """Equal hawkish and dovish totals fall back to neutral.""" + # One hawkish hit and one negated hawkish -> hawkish_total=1, dovish_total=1. + text = "加息,但不加息" + result = detect_chinese_policy_and_sectors(text, lexicon) + assert result["policy_direction"] == "neutral"