# -*- coding: utf-8 -*-
"""yfinance 全球指數資料下載與品質實測（對應文章的完整可重現腳本）。

對應文章:
  https://finlab.finance/blog/python-global-index-yfinance-tutorial

執行方式（任一 Python 3.10+ 環境）:
  pip install yfinance matplotlib lxml
  python strategy.py

或用 uv 免安裝執行:
  uv run --with yfinance --with matplotlib --with lxml python strategy.py

輸出:
  - ./out/metrics.json               文章中所有數字的來源
  - ./out/global_indices_close.csv   8 大指數 2000 年起日收盤價
  - ./out/*.png                      文章中的 5 張圖表

口徑說明:
  - 資料截止釘在 DATA_END = 2026-06-10，文章標「資料截至 2026-06」
  - 指數為「價格指數」，不含股息；報酬為純指數算術（期間報酬連乘），
    不含任何交易成本與滑價
  - 本腳本沒有任何策略或績效宣稱，僅做資料下載、整理與描述統計

投資警語: 本程式僅供教學用途，不構成投資建議，過去績效不代表未來表現。
"""
from __future__ import annotations

import io
import json
import os
from pathlib import Path

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import pandas as pd
import requests
import yfinance as yf

# The article's figures were produced with yfinance 1.4.1; print the installed
# version first so a rerun can be compared against it.
print(f"yfinance version: {yf.__version__}")

# CJK-capable font families in priority order. matplotlib picks the first
# family that is installed, so macOS keeps resolving to Heiti TC / PingFang HK
# while Windows and Linux fall through to a font that can render the Chinese
# labels instead of drawing empty boxes.
plt.rcParams["font.sans-serif"] = [
    "Heiti TC",            # macOS
    "PingFang HK",         # macOS
    "Microsoft JhengHei",  # Windows
    "Noto Sans CJK TC",    # commonly packaged on Linux
    "DejaVu Sans",         # bundled with matplotlib; last resort
]
# Draw negative numbers with an ASCII hyphen rather than U+2212.
plt.rcParams["axes.unicode_minus"] = False

# Every artefact (JSON, CSV, PNG) is written below this directory.
OUT = Path("./out")
OUT.mkdir(parents=True, exist_ok=True)

# Data cut-off, pinned for reproducibility; overridable via the DATA_END env var.
DATA_END = os.environ.get("DATA_END", "2026-06-10")
# Start of the common window used for cross-index comparison.
COMMON_START = "2000-01-01"

# Yahoo Finance symbol -> display name used in tables and chart labels.
INDICES = {
    "^GSPC": "標普 500（美）",
    "^IXIC": "那斯達克綜合（美）",
    "^DJI": "道瓊工業（美）",
    "^TWII": "台灣加權",
    "^N225": "日經 225（日）",
    "^GDAXI": "DAX（德）",
    "^FTSE": "富時 100（英）",
    "^HSI": "恒生（港）",
}

# Accumulates every number the script reports; dumped to metrics.json at the end.
metrics: dict = {
    "yfinance_version": yf.__version__,
    "data_end": DATA_END,
    "common_start": COMMON_START,
}


# ---------- 步驟 1:批次下載 8 大指數全部歷史 ----------
def download_indices() -> pd.DataFrame:
    """Fetch every symbol in INDICES in one call and return a wide close table.

    Returns:
        DataFrame with one column per index symbol, a timezone-naive date
        index, no rows after DATA_END, and no rows where every column is NaN.
    """
    symbols = list(INDICES)
    panel = yf.download(symbols, period="max", auto_adjust=False, progress=False)
    prices = panel["Close"].copy()
    # Drop any timezone from the index so date-string slicing and CSV export
    # behave uniformly.
    prices.index = pd.DatetimeIndex(prices.index).tz_localize(None)
    on_or_before_cutoff = prices.index <= DATA_END
    return prices.loc[on_or_before_cutoff].dropna(how="all")


close = download_indices()


def _coverage_entry(label: str, series: pd.Series) -> dict:
    """Describe how far back one NaN-free close series goes and how long it is."""
    dates = series.index
    return {
        "name": label,
        "first_date": str(dates[0].date()),
        "last_date": str(dates[-1].date()),
        "n_days": int(series.shape[0]),
    }


# Earliest date, latest date and observation count for each index.
coverage = {
    symbol: _coverage_entry(label, close[symbol].dropna())
    for symbol, label in INDICES.items()
}
metrics["coverage"] = coverage


# ---------- 步驟 2:共同窗口統計（2000 年起，純指數算術） ----------
def describe(s: pd.Series) -> dict:
    """Summarise a price series: total return, CAGR, annualised volatility, max drawdown.

    Args:
        s: Prices indexed by timestamps (the elapsed time is taken from the
            first and last index labels, so the index is assumed to be in
            ascending date order). NaN values are dropped before any
            calculation.

    Returns:
        Dict of plain floats, all in percent:
        ``total_return_pct`` (1 decimal), ``cagr_pct`` (2 decimals),
        ``ann_vol_pct`` (1 decimal), ``max_drawdown_pct`` (1 decimal, <= 0).

    Raises:
        ValueError: If fewer than two non-NaN prices remain, or the first and
            last observations are less than one day apart (CAGR undefined).
    """
    s = s.dropna()
    if s.shape[0] < 2:
        raise ValueError("describe() needs at least two non-NaN prices")
    n_years = (s.index[-1] - s.index[0]).days / 365.25
    if n_years <= 0:
        raise ValueError("describe() needs a series spanning at least one day")

    ret = s.pct_change().dropna()
    growth = s.iloc[-1] / s.iloc[0]
    total = growth - 1
    cagr = growth ** (1 / n_years) - 1
    # Daily standard deviation scaled by sqrt(252) trading days.
    ann_vol = ret.std() * np.sqrt(252)
    # Measure drawdown on the price itself so the first observation counts as
    # a candidate peak; a cumulative product of returns starts at the second
    # observation and would miss a decline from the very first price.
    max_dd = (s / s.cummax() - 1).min()
    return {
        "total_return_pct": round(float(total) * 100, 1),
        "cagr_pct": round(float(cagr) * 100, 2),
        "ann_vol_pct": round(float(ann_vol) * 100, 1),
        "max_drawdown_pct": round(float(max_dd) * 100, 1),
    }


# Restrict to the common comparison window and drop dates where no index has a value.
in_window = close.index >= COMMON_START
win = close.loc[in_window].dropna(how="all")

# One summary per index: display name first, followed by the describe() figures.
stats = {}
for sym in INDICES:
    stats[sym] = dict(name=INDICES[sym], **describe(win[sym]))
metrics["window_2000_stats"] = stats

# Persist the window's closes, rounded to two decimals.
win.round(2).to_csv(OUT / "global_indices_close.csv")


# ---------- 步驟 3:台積電原始收盤 vs 還原收盤 ----------
ticker = yf.Ticker("2330.TW")


def _tsmc_history(adjusted: bool) -> pd.DataFrame:
    """Fetch the full 2330.TW history, strip the index timezone, cut at DATA_END."""
    frame = ticker.history(period="max", auto_adjust=adjusted)
    frame.index = pd.DatetimeIndex(frame.index).tz_localize(None)
    return frame[frame.index <= DATA_END]


# Same ticker fetched twice: once with auto_adjust off, once with it on.
tsmc_raw = _tsmc_history(adjusted=False)
tsmc_adj = _tsmc_history(adjusted=True)

SINCE = "2010-01-01"
raw_since = tsmc_raw.loc[tsmc_raw.index >= SINCE]
adj_since = tsmc_adj.loc[tsmc_adj.index >= SINCE]
raw_close = raw_since["Close"].dropna()
adj_close = adj_since["Close"].dropna()

# Compare both series on exactly the same set of dates.
shared_days = raw_close.index.intersection(adj_close.index)
raw_close = raw_close.loc[shared_days]
adj_close = adj_close.loc[shared_days]

# End-over-start multiples and the matching compound annual growth rates.
n_years_tsmc = (raw_close.index[-1] - raw_close.index[0]).days / 365.25
raw_mult = raw_close.iloc[-1] / raw_close.iloc[0]
adj_mult = adj_close.iloc[-1] / adj_close.iloc[0]
annual_exponent = 1 / n_years_tsmc
raw_cagr = raw_mult ** annual_exponent - 1
adj_cagr = adj_mult ** annual_exponent - 1

dividends = raw_since["Dividends"]
full_close = tsmc_raw["Close"].dropna()

# Extent of the complete unadjusted history (not limited to SINCE).
first_stamp, last_stamp = full_close.index[0], full_close.index[-1]
metrics["tsmc_2330_max_history"] = {
    "first_date": str(first_stamp.date()),
    "first_close": round(float(full_close.iloc[0]), 2),
    "last_date": str(last_stamp.date()),
    "last_close": round(float(full_close.iloc[-1]), 2),
    "n_days": int(full_close.shape[0]),
}

# Raw-versus-adjusted comparison from SINCE onwards.
raw_cagr_pct = round(float(raw_cagr) * 100, 2)  # 24.10% in the article
adj_cagr_pct = round(float(adj_cagr) * 100, 2)  # 27.85% in the article
metrics["tsmc_2330"] = {
    "since": SINCE,
    "raw_multiple": round(float(raw_mult), 2),
    "adj_multiple": round(float(adj_mult), 2),
    "raw_cagr_pct": raw_cagr_pct,
    "adj_cagr_pct": adj_cagr_pct,
    "dividends_per_share_total": round(float(dividends.sum()), 2),
    "n_dividend_events": int((dividends > 0).sum()),
    "raw_first": round(float(raw_close.iloc[0]), 2),
    "raw_last": round(float(raw_close.iloc[-1]), 2),
}


# ---------- 步驟 4:world-indices 清單頁爬取實測 ----------
def scrape_world_indices(user_agent: str | None) -> dict:
    """Check whether Yahoo Finance's world-indices page can be parsed by pd.read_html.

    Args:
        user_agent: Value for the ``User-Agent`` request header; ``None`` or an
            empty string sends the request without a custom header.

    Returns:
        Dict with ``status_code`` (HTTP status), ``n_tables`` (tables parsed),
        ``n_rows`` (rows in the first table) and ``symbols`` (up to five values
        from the first table's first column, as strings). The counts stay at
        zero when the status is not 200 or the page contains no table.

    Raises:
        requests.RequestException: On network failures or timeout.
    """
    headers = {"User-Agent": user_agent} if user_agent else {}
    resp = requests.get("https://finance.yahoo.com/markets/world-indices/",
                        headers=headers, timeout=20)
    result = {"status_code": resp.status_code, "n_tables": 0, "n_rows": 0, "symbols": []}
    if resp.status_code != 200:
        return result

    try:
        tables = pd.read_html(io.StringIO(resp.text))
    except ValueError:
        # read_html raises ValueError rather than returning an empty list when
        # the document has no <table>; report that as "0 tables" so the HTTP
        # status is still recorded for the caller.
        return result

    result["n_tables"] = len(tables)
    if tables:
        first_table = tables[0]
        result["n_rows"] = int(first_table.shape[0])
        result["symbols"] = first_table.iloc[:, 0].astype(str).tolist()[:5]
    return result


# Desktop-browser User-Agent string used for the "with UA" probes.
UA = " ".join([
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/124.0 Safari/537.36",
])

# Probe the listing page without and then with a User-Agent header. Any failure
# is recorded (truncated to 120 characters) instead of aborting the script.
for metric_key, agent in (("scrape_no_ua", None), ("scrape_with_ua", UA)):
    try:
        metrics[metric_key] = scrape_world_indices(agent)
    except Exception as exc:  # noqa: BLE001
        metrics[metric_key] = {"error": str(exc)[:120]}

# Current state of the legacy URL used by 2020-era tutorials: in testing it
# answers 301 and redirects to /markets/world-indices/. Redirects are not
# followed so the status code and Location header can be recorded.
try:
    resp_old = requests.get(
        "https://finance.yahoo.com/world-indices/",
        headers={"User-Agent": UA},
        timeout=20,
        allow_redirects=False,
    )
    redirect_target = resp_old.headers.get("Location", "")
    metrics["old_url_2020"] = {
        "status_code": resp_old.status_code,
        "redirect_to": redirect_target[:120],
    }
except Exception as exc:  # noqa: BLE001
    metrics["old_url_2020"] = {"error": str(exc)[:120]}


# ---------- 步驟 5:畫圖（16:9，共 5 張） ----------
# Shared figure geometry (16:9) and colour palette for all charts.
FIGSIZE = (12, 6.75)
COLORS = plt.cm.tab10.colors


def _rebase_to_100(series: pd.Series) -> pd.Series:
    """Scale a series so that its first value equals 100."""
    return series / series.iloc[0] * 100


# Figure 1: cumulative paths since 2000. Each index is rebased to 100 on the
# first day it has data inside the window, matching the statistics table.
norm = pd.DataFrame({sym: _rebase_to_100(win[sym].dropna()) for sym in INDICES})
fig, ax = plt.subplots(figsize=FIGSIZE, dpi=100)

# Plot (and list in the legend) from the highest final level to the lowest.
final_levels = norm.ffill().iloc[-1]
order = final_levels.sort_values(ascending=False).index
for rank, sym in enumerate(order):
    path = norm[sym].dropna()
    legend_text = f"{INDICES[sym]} {path.iloc[-1]:,.0f}"
    ax.plot(path.index, path, lw=1.6, color=COLORS[rank % 10], label=legend_text)

# Log scale with plain-number tick labels.
ax.set_yscale("log")
ax.set_yticks([50, 100, 200, 400, 800])
ax.yaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter())

ax.set_title("8 大指數 2000 年至今累積走勢（起點=100，對數座標，不含股息）")
ax.set_ylabel("指數化價格（2000 年初=100）")
ax.legend(loc="upper left", fontsize=9, ncol=2)
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig(OUT / "global_indices_2000_2026.png")
plt.close(fig)

# Figure 2: TSMC raw close versus adjusted close, both rebased to 1 at the
# first shared date.
fig, ax = plt.subplots(figsize=FIGSIZE, dpi=100)
tsmc_lines = (
    (raw_close, "#d62728", f"原始收盤價（漲 {raw_mult - 1:.1f} 倍）"),
    (adj_close, "#1f77b4", f"還原收盤價 auto_adjust=True（漲 {adj_mult - 1:.1f} 倍）"),
)
for prices, line_color, legend_text in tsmc_lines:
    rebased = prices / prices.iloc[0]
    ax.plot(prices.index, rebased, lw=1.8, color=line_color, label=legend_text)
ax.set_title("台積電 2330.TW:原始收盤 vs 還原收盤（2010 年初 = 1）")
ax.legend(loc="upper left")
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig(OUT / "tsmc_raw_vs_adjusted.png")
plt.close(fig)

# Figure 3: first available year per index (a data-completeness check).
# Horizontal bars run from each index's first year to the data cut-off year.
fig, ax = plt.subplots(figsize=FIGSIZE, dpi=100)
syms_cov = sorted(INDICES, key=lambda s: coverage[s]["first_date"])
years = [int(coverage[s]["first_date"][:4]) for s in syms_cov]
names_cov = [f"{INDICES[s]}  {s}" for s in syms_cov]

# Take the right edge of the bars and the month shown in the title from
# DATA_END so that overriding the cut-off keeps the chart consistent with the
# data; with the default cut-off this is 2026 / "2026-06".
data_end_ts = pd.Timestamp(DATA_END)
END_YEAR = data_end_ts.year
bars = ax.barh(names_cov, [END_YEAR - y for y in years], left=years,
               color="#1f77b4", alpha=0.85)

# Annotate each bar, just inside its left edge, with start date and row count.
for bar, sym, year in zip(bars, syms_cov, years):
    label = (f"{coverage[sym]['first_date']} 起，"
             f"共 {coverage[sym]['n_days']:,} 個交易日")
    ax.text(year + 0.3, bar.get_y() + bar.get_height() / 2, label,
            va="center", fontsize=10, color="white")
ax.set_xlim(1925, 2030)
ax.set_title(
    f'yfinance 全球指數可回溯起始年實測（period="max"，資料截至 {data_end_ts:%Y-%m}）'
)
ax.set_xlabel("年份")
# Earliest-starting index at the top.
ax.invert_yaxis()
ax.grid(axis="x", alpha=0.3)
fig.tight_layout()
fig.savefig(OUT / "data_coverage.png")
plt.close(fig)

# Figure 4: risk/return map — annualised volatility on x, CAGR on y.
fig, ax = plt.subplots(figsize=FIGSIZE, dpi=100)
for position, (sym, display_name) in enumerate(INDICES.items()):
    point = (stats[sym]["ann_vol_pct"], stats[sym]["cagr_pct"])
    ax.scatter(*point, s=160, color=COLORS[position % 10])
    # Label sits slightly up and to the right of its marker.
    ax.annotate(display_name, point, xytext=(8, 6), textcoords="offset points")
ax.set_xlabel("年化波動度（%）")
ax.set_ylabel("年化報酬 CAGR（%）")
ax.set_title("全球 8 大指數風險報酬地圖（2000-2026，價格指數）")
ax.grid(alpha=0.3)
fig.tight_layout()
fig.savefig(OUT / "risk_return_map.png")
plt.close(fig)

def _dd_color(drawdown_pct: float) -> str:
    """Pick the bar colour: red below -60, orange below -50, blue otherwise."""
    if drawdown_pct < -60:
        return "#d62728"
    if drawdown_pct < -50:
        return "#ff7f0e"
    return "#1f77b4"


# Figure 5: maximum drawdown per index, deepest first.
fig, ax = plt.subplots(figsize=FIGSIZE, dpi=100)
syms_dd = sorted(INDICES, key=lambda s: stats[s]["max_drawdown_pct"])
dd_values = [stats[sym]["max_drawdown_pct"] for sym in syms_dd]
names_dd = [INDICES[sym] for sym in syms_dd]
dd_colors = [_dd_color(value) for value in dd_values]
bars = ax.bar(names_dd, dd_values, color=dd_colors)

# Print each percentage just below the bottom of its bar.
for rect, value in zip(bars, dd_values):
    x_center = rect.get_x() + rect.get_width() / 2
    ax.text(x_center, value - 1.5, f"{value:.0f}%",
            ha="center", va="top", fontsize=11)
ax.set_title("全球 8 大指數最大回撤（2000-2026，收盤價計，資料截至 2026-06）")
ax.set_ylabel("最大回撤（%）")
# Leave room under the deepest bar for its label.
ax.set_ylim(min(dd_values) - 10, 0)
ax.grid(axis="y", alpha=0.3)
plt.setp(ax.get_xticklabels(), fontsize=10)
fig.tight_layout()
fig.savefig(OUT / "max_drawdown.png")
plt.close(fig)

# Serialise the collected metrics once, then write the same text to disk and stdout.
report = json.dumps(metrics, ensure_ascii=False, indent=2)
(OUT / "metrics.json").write_text(report, encoding="utf-8")

print(report)
print("done. 圖表與數據已輸出到 ./out/")
