复现方法
该计算逻辑可在 Excel、Python 或本页面浏览器脚本中复现。关键点是先还原线性定量值,再使用样本标准差,而不是直接对 log2 数值求 CV。复现原 CV 汇总时使用矩阵中的全部 Protein Group 行,阈值计数不是过滤前置步骤。
识别样本列
读取所有以 `.PG.Log2Quantity` 结尾的列,并根据样本名、分组名或手动规则划分到不同实验组。
还原线性丰度
Spectronaut PG 定量矩阵常以 log2 形式输出。CV 衡量的是线性丰度的相对离散程度,因此每个值先计算 `2 ^ log2Quantity`。
使用 STDEV.S
组内 CV 使用样本标准差 `STDEV.S`,即方差分母为 `n - 1`。这与总体标准差 `STDEV.P` 的结果不同。
阈值计数
每个 Protein Group 得到一个组内 CV 后,再统计 `CV ≤ 阈值` 的数量。阈值可设为 10%、20% 或实验方案要求的其他值。
import re
import math
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("spectronaut_pg_matrix.tsv", sep="\t")
sample_cols = [c for c in df.columns if c.endswith(".PG.Log2Quantity")]
# 按样本列名分组;也可以换成手动列名列表。
group_rules = {
"O": r"Condition-O|Cellenone|\\bO\\b",
"S": r"Condition-S|Celleagle|\\bS\\b",
}
groups = {
label: [c for c in sample_cols if re.search(pattern, c, re.I)]
for label, pattern in group_rules.items()
}
def cv_from_log2(cols):
log2_values = df[cols].apply(pd.to_numeric, errors="coerce").to_numpy(float)
linear_values = np.power(2.0, log2_values)
valid_n = np.sum(~np.isnan(linear_values), axis=1)
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
mean = np.nanmean(linear_values, axis=1)
sd = np.nanstd(linear_values, axis=1, ddof=1) # STDEV.S
cv = sd / mean
cv[(valid_n < 2) | ~np.isfinite(cv)] = np.nan
return valid_n, cv
results = {}
for label, cols in groups.items():
valid_n, cv = cv_from_log2(cols)
results[label] = {
"sample_count": len(cols),
"valid_n": valid_n,
"cv": cv,
"cv_percent": cv[np.isfinite(cv)] * 100,
}
# 1. Protein Group CVs below X:先逐行算 CV,再统计阈值内数量。
summary = pd.DataFrame([
{
"XLabel": label,
"SampleColumns": item["sample_count"],
"Identified": int((item["valid_n"] > 0).sum()),
"Identified <= 20% CV": int(np.nansum(item["cv"] <= 0.20)),
"Identified <= 10% CV": int(np.nansum(item["cv"] <= 0.10)),
"ComputableCV": int(np.isfinite(item["cv"]).sum()),
}
for label, item in results.items()
])
summary.to_csv("protein_group_cvs_below_x.tsv", sep="\t", index=False)
fig, ax = plt.subplots(figsize=(12.8, 6.0))
x = np.arange(len(summary))
w = 0.23
bars = [
ax.bar(x - w, summary["Identified"], w, label="Identified", color="#c90b32", edgecolor="black"),
ax.bar(x, summary["Identified <= 20% CV"], w, label="Identified <= 20% CV", color="#8f8f8f", edgecolor="black"),
ax.bar(x + w, summary["Identified <= 10% CV"], w, label="Identified <= 10% CV", color="#b8b8b8", edgecolor="black"),
]
for bar_group in bars:
ax.bar_label(bar_group, labels=[f"{int(v):,}" for v in bar_group.datavalues], padding=3, fontsize=8)
ax.set_title("Protein Group CVs below X", fontsize=10, fontweight="bold")
ax.set_ylabel("Count", fontweight="bold")
ax.set_xlabel("Condition", fontweight="bold")
ax.set_xticks(x, summary["XLabel"])
ax.legend(loc="upper left", ncols=3, frameon=True, edgecolor="black", fancybox=False, fontsize=8)
ax.grid(axis="y", color="#e5e5e5")
fig.tight_layout()
fig.savefig("protein_group_cvs_below_x.svg")
# 2. CV Distribution per Condition:使用逐蛋白 %CV 做 Gaussian KDE。
def gaussian_kde(values, x_grid):
values = np.asarray(values, dtype=float)
values = values[np.isfinite(values)]
if values.size == 0:
return np.zeros_like(x_grid)
q1, q3 = np.nanquantile(values, [0.25, 0.75])
iqr = q3 - q1
sd = np.nanstd(values, ddof=1) if values.size > 1 else np.nan
scale = min([v for v in [sd, iqr / 1.34] if np.isfinite(v) and v > 0] or [1.0])
bandwidth = np.clip(0.9 * scale * values.size ** (-0.2), 1.2, 18.0)
z = (x_grid[:, None] - values[None, :]) / bandwidth
return np.exp(-0.5 * z * z).sum(axis=1) / (values.size * bandwidth * math.sqrt(2 * math.pi))
x_grid = np.linspace(0, 250, 2000)
density_table = {}
fig, ax = plt.subplots(figsize=(12.8, 5.0))
for label, item in results.items():
y = gaussian_kde(item["cv_percent"], x_grid)
density_table[f"(x)% CV_[{label}]"] = x_grid
density_table[f"(y)Density_[{label}]"] = y
ax.plot(x_grid, y, "--", linewidth=1.2, label=label)
median = np.nanmedian(item["cv_percent"])
ax.axvline(median, linestyle="--", linewidth=1.0)
ax.text(median + 1.5, y.max() * 0.72, f"Median: {median:.1f}%", fontsize=8)
pd.DataFrame(density_table).to_csv("protein_group_cv_distribution.tsv", sep="\t", index=False)
ax.set_title("Protein Group- CV Distribution per Condition", fontsize=10, fontweight="bold")
ax.set_xlabel("% CV", fontweight="bold")
ax.set_ylabel("Density", fontweight="bold")
ax.set_xlim(0, 250)
ax.legend(loc="upper left", frameon=True, edgecolor="black", fancybox=False, fontsize=8)
fig.tight_layout()
fig.savefig("protein_group_cv_distribution.svg")
# 3. Protein Group CVs:逐蛋白 %CV 箱线图。
cv_columns = {
label: pd.Series(np.sort(item["cv_percent"]))
for label, item in results.items()
}
pd.DataFrame(cv_columns).to_csv("protein_group_cvs.tsv", sep="\t", index=False)
fig, ax = plt.subplots(figsize=(12.8, 4.6))
labels = list(results.keys())
positions = np.arange(1, len(labels) + 1)
ax.boxplot(
[item["cv_percent"] for item in results.values()],
positions=positions,
whis=1.5,
showmeans=True,
showfliers=False,
meanprops={"marker": "D", "markerfacecolor": "none", "markeredgecolor": "red", "markersize": 4},
medianprops={"color": "black", "linewidth": 1.4},
whiskerprops={"color": "black", "linestyle": ":"},
capprops={"color": "black"},
)
ax.set_xticks(positions, labels)
ax.set_ylabel("% CV", fontweight="bold")
ax.set_xlabel("Condition", fontweight="bold")
fig.tight_layout()
fig.savefig("protein_group_cvs_boxplot.svg")