Coverage for wifa_uq / postprocessing / PCE_tool / pce_utils.py: 93%

301 statements  

coverage.py v7.13.0, created at 2025-12-19 02:10 +0000

import numpy as np
import pandas as pd
from pathlib import Path
import openturns as ot
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import wasserstein_distance, ks_2samp, entropy


def build_input_output_arrays(ds, stochastic_vars, model_vars, output_var):
    """
    Build the input and output arrays for PCE construction from a dataset.
    """
    ntimes = len(ds["wind_farm"].values)
    kk_values = ds[model_vars[0]].values if model_vars else np.array([0.0])
    if not model_vars:
        nk = 1
    else:
        nk = ds.sizes["sample"]

    n_feature = len(stochastic_vars)
    nvar = n_feature + len(model_vars)

    input_array_physical = np.zeros((ntimes, n_feature))
    input_array = np.zeros((ntimes * nk, nvar))
    output_array = np.zeros((ntimes * nk))

    # Physical variables: repeat each time series once per model-variable sample
    for j, var in enumerate(stochastic_vars):
        values = np.array(ds[var])
        input_array_physical[:, j] = values
        for i in range(nk):
            input_array[i * ntimes : (i + 1) * ntimes, j] = values

    # Model variables: constant within each block of ntimes rows
    for j, var in enumerate(model_vars):
        for i in range(nk):
            input_array[i * ntimes : (i + 1) * ntimes, n_feature + j] = kk_values[i]

    # Output
    if not model_vars:
        # output_var expected shape (sample, ntimes)
        ymean = ds[output_var].mean(dim="sample").values  # shape (ntimes,)
        output_array[:] = np.tile(ymean, nk)  # or simply: output_array = np.tile(ymean, nk)
    else:
        for i in range(nk):
            output_array[i * ntimes : (i + 1) * ntimes] = ds[output_var][i, :]

    varnames = stochastic_vars + model_vars
    return input_array, output_array, varnames, kk_values, n_feature
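
# Illustrative sketch (not part of the original module): exercises
# build_input_output_arrays on a tiny synthetic xarray Dataset. The
# variable names "ws", "kk", and "model_bias_cap" are assumptions chosen
# to match the target names used by run_pce_sensitivity below.
def _example_build_arrays():
    import xarray as xr  # only needed for this sketch

    rng = np.random.default_rng(0)
    ntimes, nsamples = 5, 3
    ds = xr.Dataset(
        {
            "ws": ("wind_farm", rng.uniform(5.0, 15.0, ntimes)),
            "kk": ("sample", np.linspace(0.03, 0.07, nsamples)),
            "model_bias_cap": (
                ("sample", "wind_farm"),
                rng.normal(0.0, 1.0, (nsamples, ntimes)),
            ),
        },
        coords={"wind_farm": np.arange(ntimes), "sample": np.arange(nsamples)},
    )
    X, y, names, kk, nf = build_input_output_arrays(ds, ["ws"], ["kk"], "model_bias_cap")
    # X stacks the "ws" series once per kk sample: shape (ntimes * nsamples, 2)
    print(X.shape, y.shape, names, nf)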

def construct_PCE_ot(
    input_array, output_array, marginals, copula, degree, LARS=True, q=1
):
    """
    Construct a Polynomial Chaos Expansion using OpenTURNS.
    """
    Nt = input_array.shape[0]
    Nvar = input_array.shape[1]

    # Create samples
    outputSample = ot.Sample(Nt, 1)
    for i in range(Nt):
        outputSample[i, 0] = output_array[i]

    polyColl = ot.PolynomialFamilyCollection(Nvar)
    collection = ot.DistributionCollection(Nvar)
    marginal = {}
    UncorrelatedInputSample = ot.Sample(input_array)

    # Marginals: one distribution per input variable
    for i in range(Nvar):
        varSample = ot.Sample(Nt, 1)
        for j in range(Nt):
            varSample[j, 0] = input_array[j, i]
        minValue = varSample.getMin()[0]
        maxValue = varSample.getMax()[0]
        if marginals[i] == "kernel":
            marginal[i] = ot.KernelSmoothing().build(varSample)
        elif marginals[i] == "uniform":
            # Widen the support slightly so boundary samples stay inside it
            marginal[i] = ot.Uniform(minValue - 1e-5, maxValue + 1e-5)
        else:
            marginal[i] = ot.NormalFactory().build(varSample)
        collection[i] = ot.Distribution(marginal[i])

    # Copula (falls back to independent for unrecognized names)
    if copula == "independent":
        copula = ot.IndependentCopula(Nvar)
    elif copula in ["gaussian", "normal"]:
        copula = ot.NormalCopulaFactory().build(ot.Sample(input_array))
    else:
        copula = ot.IndependentCopula(Nvar)

    UncorrelatedInputDistribution = ot.ComposedDistribution(collection, copula)

    # Polynomial basis: orthogonal family per marginal
    for v in range(Nvar):
        marginalv = UncorrelatedInputDistribution.getMarginal(v)
        polyColl[v] = ot.StandardDistributionPolynomialFactory(marginalv)

    enumerateFunction = ot.HyperbolicAnisotropicEnumerateFunction(Nvar, q)
    multivariateBasis = ot.OrthogonalProductPolynomialFactory(
        polyColl, enumerateFunction
    )
    P = enumerateFunction.getStrataCumulatedCardinal(degree)
    adaptiveStrategy = ot.FixedStrategy(multivariateBasis, P)

    if LARS:
        # Sparse basis selection: LARS with corrected leave-one-out validation
        basisSequenceFactory = ot.LARS()
        fittingAlgorithm = ot.CorrectedLeaveOneOut()
        approximationAlgorithm = ot.LeastSquaresMetaModelSelectionFactory(
            basisSequenceFactory, fittingAlgorithm
        )
        projectionStrategy = ot.LeastSquaresStrategy(
            UncorrelatedInputSample, outputSample, approximationAlgorithm
        )
        algo = ot.FunctionalChaosAlgorithm(
            UncorrelatedInputSample,
            outputSample,
            UncorrelatedInputDistribution,
            adaptiveStrategy,
            projectionStrategy,
        )
    else:
        wei_exp = ot.MonteCarloExperiment(
            UncorrelatedInputDistribution, UncorrelatedInputSample.getSize()
        )
        X_UncorrelatedInputSample, weights = wei_exp.generateWithWeights()
        projectionStrategy = ot.LeastSquaresStrategy()
        algo = ot.FunctionalChaosAlgorithm(
            X_UncorrelatedInputSample,
            weights,
            outputSample,
            UncorrelatedInputDistribution,
            adaptiveStrategy,
            projectionStrategy,
        )

    algo.run()
    return algo.getResult()
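
# Minimal fit-and-evaluate sketch (not part of the original module):
# fits a degree-3 PCE with uniform marginals to a known polynomial and
# evaluates the resulting metamodel at one point.
def _example_pce_fit():
    rng = np.random.default_rng(1)
    X = rng.uniform(0.0, 1.0, (200, 2))
    y = X[:, 0] ** 2 + 0.5 * X[:, 1]
    result = construct_PCE_ot(X, y, ["uniform", "uniform"], "independent", degree=3)
    metamodel = result.getMetaModel()
    # Should be close to 0.3**2 + 0.5 * 0.7 = 0.44
    print(float(metamodel([0.3, 0.7])[0]))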

def compute_sobol_indices(pce_result, nvar):
    """
    Compute first-order and total Sobol indices for each input variable.
    """
    first_sobol = np.zeros((nvar))
    total_sobol = np.zeros((nvar))
    chaosSI = ot.FunctionalChaosSobolIndices(pce_result)
    for v in range(nvar):
        first_sobol[v] = chaosSI.getSobolIndex(v)
        total_sobol[v] = chaosSI.getSobolTotalIndex(v)
    return first_sobol, total_sobol
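
# Hedged sketch (not in the original module): Sobol indices from a PCE
# fitted to a toy function where x0 drives most of the output variance.
def _example_sobol():
    rng = np.random.default_rng(2)
    X = rng.uniform(-1.0, 1.0, (300, 2))
    y = np.sin(np.pi * X[:, 0]) + 0.1 * X[:, 1]
    result = construct_PCE_ot(X, y, ["uniform", "uniform"], "independent", degree=5)
    first, total = compute_sobol_indices(result, nvar=2)
    # Expect first[0] and total[0] to dominate
    print(first, total)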

def plot_sobol_indices(
    first, total, varnames, save=False, filename="pce_sobol_indices.png"
):
    """
    Plot Sobol indices
    """
    colors = ["#002d74", "#e85113"]
    nvar = len(varnames)
    bar_width = 0.24
    bar_positions1 = np.arange(nvar) + 0.5 * bar_width
    bar_positions2 = np.arange(nvar) + 1.5 * bar_width

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(bar_positions1, first, width=bar_width, color=colors[0], label="1st order")
    ax.bar(bar_positions2, total, width=bar_width, color=colors[1], label="Total order")
    ax.set_xticks(bar_positions2)
    ax.set_xticklabels(varnames, rotation=45, ha="right")
    ax.set_xlabel("Variables")
    ax.set_ylabel("Sobol indices")
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    if save:
        plt.savefig(filename, dpi=150)
    plt.show()
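
# Usage sketch for the bar plot (synthetic indices, hypothetical names):
def _example_plot_sobol():
    first = np.array([0.55, 0.20, 0.05])
    total = np.array([0.65, 0.30, 0.10])
    plot_sobol_indices(first, total, ["ws", "ti", "kk"], save=False)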

def plot_training_quality(
    database,
    varnames,
    value_of_interest,
    kk_values,
    input_variable_array_physical,
    n_feature,
    nvar,
    PCE_metamodel,
    save=True,
    plot_options=None,
    seed=42,
    output_dir="./PCE_training_quality/",
):
    """
    Evaluate PCE training quality via scatter plots, distribution comparison, and metrics.

    Special case:
        - If there is no model variable (nvar == n_feature), the PCE is deterministic per time index.

    Args:
        database: Dataset of inputs.
        varnames: Physical (stochastic) variable names.
        value_of_interest: Target array of shape (sample_size, ntimes).
        kk_values: Model-variable values per sample (None if no model variable).
        input_variable_array_physical: Physical input array (n_samples, n_feature).
        n_feature: Number of physical variables.
        nvar: Total number of input variables (physical + model).
        PCE_metamodel: OpenTURNS PCE metamodel callable.
        save: Whether to save figures to disk.
        plot_options: Dict with flags: {"scatter": bool, "distribution": bool, "metrics": list}.
        seed: RNG seed for selecting time indices.
        output_dir: Output directory for figures (str or Path).

    Returns:
        None
    """
    plot_options = plot_options or {}  # guard against the None default
    output_dir = Path(output_dir)  # the "/" joins below require a Path

    ntimes = value_of_interest.shape[1]

    # Always use the dataset sample dimension
    sample_size = value_of_interest.shape[0]

    # Detect the presence of a model variable
    has_model_var = (
        (nvar > n_feature)
        and (kk_values is not None)
        and (len(kk_values) == sample_size)
    )

    # Compute PCE realizations: shape (sample_size, ntimes)
    PCE_realizations = np.zeros((sample_size, ntimes))

    for it in range(ntimes):
        physical_var = input_variable_array_physical[it, :]

        if has_model_var:
            realization = np.zeros(nvar)
            realization[:n_feature] = physical_var
            for j, k in enumerate(kk_values):
                realization[-1] = k
                PCE_realizations[j, it] = PCE_metamodel(realization)[0]
        else:
            # Deterministic (no model variable): one prediction per time index,
            # broadcast over samples
            realization = np.zeros(n_feature)
            realization[:] = physical_var
            yhat = PCE_metamodel(realization)[0]
            PCE_realizations[:, it] = yhat

    # ------------------------------------------------------------------
    # SPECIAL CASE: no model variable -> compare all (sample, time) pairs
    # ------------------------------------------------------------------

    if not has_model_var and plot_options.get("scatter", False):
        # PCE deterministic per time index -> one prediction per it
        obs = np.asarray(value_of_interest)  # (sample_size, ntimes)
        pce_curve = np.asarray(PCE_realizations[0, :])  # (ntimes,)

        # Build scatter arrays over all (sample, time) pairs
        x = obs.ravel()  # all observed points
        y = np.tile(pce_curve, sample_size)  # predictions repeated over samples

        fig, ax = plt.subplots(1, 1, figsize=(7, 6))
        ax.grid(True)

        ax.plot(
            x,
            y,
            "r.",
            alpha=0.35,
            markersize=4,
            label="PCE vs Data (all samples, all times)",
        )

        # y=x reference
        xmin = float(min(np.min(x), np.min(y)))
        xmax = float(max(np.max(x), np.max(y)))
        ax.plot([xmin, xmax], [xmin, xmax], "k--", linewidth=1.5, label="y=x")

        ax.set_xlabel("Observed values")
        ax.set_ylabel("PCE prediction")
        ax.set_title(
            "Training scatter (no model var): PCE deterministic per time index"
        )
        ax.legend()

        plt.tight_layout()
        if save:
            plt.savefig(output_dir / "PCE_training_scatter_no_modelvar.png", dpi=150)
        # plt.show()

        return

    # ------------------------------------------------------------------
    # Standard case (model variable present)
    # ------------------------------------------------------------------

    # Fix the random seed for reproducibility
    rng = np.random.default_rng(seed)
    n_pick = 4 if ntimes >= 4 else ntimes
    time_indices = rng.choice(ntimes, n_pick, replace=False)

    # --- Scatter plot ---
    if plot_options.get("scatter", False):
        fig, ax = plt.subplots(1, n_pick, figsize=(4.5 * n_pick, 6))
        if n_pick == 1:
            ax = [ax]

        for idx, it in enumerate(time_indices):
            # Build a wrapped title listing the physical inputs at this time index
            line_len = 0
            str_title = ""
            for var_name in varnames:
                value_str = f"{var_name}={np.array(database[var_name])[it]:.4g}"
                if line_len + len(value_str) > 30:
                    str_title += "\n"
                    line_len = 0
                str_title += value_str + ", "
                line_len += len(value_str) + 2
            str_title = str_title.rstrip(", ")

            ax[idx].set_title(str_title)
            ax[idx].grid(True)

            ax[idx].plot(
                value_of_interest[:, it],
                PCE_realizations[:, it],
                "r.",
                label="PCE vs Data",
            )
            ax[idx].plot(
                value_of_interest[:, it], value_of_interest[:, it], "k--", label="y=x"
            )
            ax[idx].set_xlabel("Observed biases")
            ax[idx].set_ylabel("PCE biases")
            ax[idx].legend()

        plt.tight_layout(rect=[0, 0, 1, 0.95])
        if save:
            plt.savefig(output_dir / "PCE_training_scatter.png", dpi=150)
        # plt.show()

    # --- Distribution comparison ---
    if plot_options.get("distribution", False):
        fig, ax = plt.subplots(1, n_pick, figsize=(4.5 * n_pick, 6))
        if n_pick == 1:
            ax = [ax]

        mb_min = float(value_of_interest.min().values)
        mb_max = float(value_of_interest.max().values)
        pce_min = float(np.min(PCE_realizations))
        pce_max = float(np.max(PCE_realizations))
        xs = np.linspace(min(mb_min, pce_min), max(mb_max, pce_max), 1000)

        def safe_kde(arr):
            # Guard against degenerate samples for which a KDE is undefined
            arr = np.asarray(arr)
            if arr.size < 2:
                return None
            if np.std(arr) <= 1e-12:
                return None
            try:
                return stats.gaussian_kde(arr)
            except Exception:
                return None

        for i, it in enumerate(time_indices):
            ref = np.asarray(value_of_interest[:, it])
            pce = np.asarray(PCE_realizations[:, it])

            ref_kde = safe_kde(ref)
            pce_kde = safe_kde(pce)

            if ref_kde is not None:
                ax[i].plot(xs, ref_kde(xs), color="green", label="Observed")
            else:
                ax[i].axvline(
                    np.mean(ref),
                    color="green",
                    linestyle="--",
                    label="Observed (degenerate)",
                )

            if pce_kde is not None:
                ax[i].plot(xs, pce_kde(xs), color="blue", label="PCE")
            else:
                ax[i].axvline(
                    np.mean(pce), color="blue", linestyle="--", label="PCE (degenerate)"
                )

            ax[i].set_xlabel("Biases")
            ax[i].set_ylabel("PDF")
            ax[i].grid(True)
            if i == 0:
                ax[i].legend()

        if save:
            plt.savefig(output_dir / "PCE_training_distribution.png", dpi=150)
        # plt.show()

    # --- Metrics ---
    metrics_to_plot = plot_options.get("metrics", [])
    if metrics_to_plot:
        mb_min = float(value_of_interest.min().values)
        mb_max = float(value_of_interest.max().values)
        pce_min = float(np.min(PCE_realizations))
        pce_max = float(np.max(PCE_realizations))
        xs = np.linspace(min(mb_min, pce_min), max(mb_max, pce_max), 1000)

        metric_dict = {"RMSE": [], "R2": [], "Wasserstein": [], "KS": [], "KL": []}

        def can_kde(arr):
            arr = np.asarray(arr)
            return (arr.size >= 2) and (np.std(arr) > 1e-12)

        for it in range(ntimes):
            ref = np.asarray(value_of_interest[:, it])
            pce = np.asarray(PCE_realizations[:, it])

            metric_dict["RMSE"].append(np.sqrt(mean_squared_error(ref, pce)))
            metric_dict["R2"].append(r2_score(ref, pce))
            metric_dict["Wasserstein"].append(wasserstein_distance(ref, pce))

            ks_stat, _ = ks_2samp(ref, pce)
            metric_dict["KS"].append(ks_stat)

            # KL divergence between normalized KDE estimates of both samples
            if "KL" in metrics_to_plot and can_kde(ref) and can_kde(pce):
                try:
                    ref_kde = stats.gaussian_kde(ref)
                    pce_kde = stats.gaussian_kde(pce)
                    p_vals = ref_kde(xs) + 1e-12
                    q_vals = pce_kde(xs) + 1e-12
                    p_vals /= np.sum(p_vals)
                    q_vals /= np.sum(q_vals)
                    metric_dict["KL"].append(entropy(p_vals, q_vals))
                except Exception:
                    metric_dict["KL"].append(np.nan)
            else:
                metric_dict["KL"].append(np.nan)

        fig, axes = plt.subplots(
            len(metrics_to_plot), 1, figsize=(10, 3 * len(metrics_to_plot)), sharex=True
        )
        if len(metrics_to_plot) == 1:
            axes = [axes]

        for ax, metric in zip(axes, metrics_to_plot):
            ax.plot(metric_dict[metric], label=metric, color="tab:blue")
            ax.set_ylabel(metric)
            ax.grid(True)
            ax.legend()

        axes[-1].set_xlabel("Time index")
        plt.tight_layout()
        if save:
            plt.savefig(output_dir / "PCE_training_metrics.png", dpi=150)
        # plt.show()
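
# Illustrative sketch (not part of the original module): drives
# plot_training_quality with synthetic stand-ins. The lambda mimics an
# OpenTURNS metamodel, which returns an indexable point; "ws" is a
# hypothetical feature name.
def _example_training_quality():
    rng = np.random.default_rng(0)
    ntimes, nsamp = 12, 6
    db = {"ws": rng.uniform(5.0, 15.0, ntimes)}
    kk = np.linspace(0.03, 0.07, nsamp)
    X_phys = db["ws"].reshape(-1, 1)  # (ntimes, 1)
    truth = 0.1 * X_phys.T + kk.reshape(-1, 1)  # (nsamp, ntimes)
    meta = lambda x: [0.1 * x[0] + x[-1]]  # stand-in for a fitted PCE
    plot_training_quality(
        database=db,
        varnames=["ws"],
        value_of_interest=truth,
        kk_values=kk,
        input_variable_array_physical=X_phys,
        n_feature=1,
        nvar=2,
        PCE_metamodel=meta,
        save=False,
        plot_options={"scatter": True},
    )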

def run_pce_sensitivity(database, feature_names, pce_config: dict, output_dir: Path):
    """
    PCE-based sensitivity analysis on the model error.

    Computes Sobol indices to determine which physical features
    contribute most to the error variance.

    Args:
        database: xarray Dataset
        feature_names: List of feature names corresponding to X columns
        pce_config: Dict with PCE settings:
            - degree (int): Polynomial degree, default 5
            - marginals (str): 'kernel', 'uniform', or 'normal', default 'kernel'
            - copula (str): 'independent' or 'normal', default 'independent'
            - q (float): Hyperbolic truncation parameter, default 1.0
            - model_coeff_name (str or None): Model coefficient name, default None
            - plot_options (dict): Settings to evaluate PCE quality:
                - scatter (bool): activate the scatter plot
                - distribution (bool): activate the distribution plot
                - metrics (list): Metrics to plot, among ["RMSE", "R2", "Wasserstein", "KS", "KL"]
        output_dir: Path to save plots and CSV output

    Returns:
        Dict with:
            - 'sobol_first': dict mapping feature names to first-order indices
            - 'sobol_total': dict mapping feature names to total-order indices
            - 'pce_result': the fitted PCE object (for further analysis if needed)
            - 'feature_names': list of feature names
            - 'model_coeff_name': model coefficient name
            - 'varnames': list of variable names, feature_names + model_coeff_name
            - 'value_of_interest': target name, 'ref_power_cap' if model_coeff_name is None, else 'model_bias_cap'
    """
    # Extract config with defaults
    degree = pce_config.get("degree", 5)
    marginals = pce_config.get("marginals", "kernel")
    copula = pce_config.get("copula", "independent")
    q = pce_config.get("q", 1.0)
    DEFAULT_PLOT_OPTIONS = {
        "scatter": True,
        "distribution": False,
        "metrics": [],
    }
    user_plot_options = pce_config.get("plot_options", {}) or {}
    plot_options = {**DEFAULT_PLOT_OPTIONS, **user_plot_options}

    print("pce_config keys:", pce_config.keys())
    print("pce_config:", pce_config)
    print("plot_options:", plot_options)
    model_coeff_name = pce_config.get("model_coeff_name", None)

    # Accept the string "none" (e.g. from a config file) as None
    if isinstance(model_coeff_name, str) and model_coeff_name.strip().lower() == "none":
        model_coeff_name = None

    # Decide model vars + target output (hardcoded target names)
    if model_coeff_name is None:
        model_vars = []
        value_of_interest = "ref_power_cap"
    else:
        model_vars = [model_coeff_name]
        value_of_interest = "model_bias_cap"

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n--- PCE Sensitivity (single run) ---")
    print(f"  model_coeff_name: {model_coeff_name}")
    print(f"  value_of_interest: {value_of_interest}")

    # Build arrays
    input_array, output_array, varnames, kk_values, n_feature = (
        build_input_output_arrays(
            database, feature_names, model_vars, value_of_interest
        )
    )

    # Validate shapes (n_features counts all PCE inputs: physical + model)
    n_samples, n_features = input_array.shape

    print("--- Running PCE Sensitivity Analysis ---")
    print(f"  Samples: {n_samples}, Features: {n_features}")
    print(f"  Degree: {degree}, Marginals: {marginals}, Copula: {copula}, q: {q}")
    print(f"  Features: {feature_names}")

    # Construct PCE metamodel
    print("  Constructing PCE metamodel...")
    pce_result = construct_PCE_ot(
        input_array=input_array,
        output_array=output_array,
        marginals=[marginals] * n_features,
        copula=copula,
        degree=degree,
        q=q,
    )

    # Compute Sobol indices
    print("  Computing Sobol indices...")
    sobol_first, sobol_total = compute_sobol_indices(pce_result, n_features)

    # Create the results dict
    results = {
        "sobol_first": dict(zip(varnames, sobol_first)),
        "sobol_total": dict(zip(varnames, sobol_total)),
        "pce_result": pce_result,
        "feature_names": feature_names,
        "model_coeff_name": model_coeff_name,
        "varnames": varnames,
        "value_of_interest": value_of_interest,
    }

    # Save outputs (file suffix based on model_coeff_name/target)
    tag = "no_modelvar" if model_coeff_name is None else str(model_coeff_name)
    plot_file = output_dir / f"pce_sobol_indices_{tag}.png"
    csv_file = output_dir / f"pce_sobol_indices_{tag}.csv"

    # Plot Sobol indices
    plot_sobol_indices(
        sobol_first,
        sobol_total,
        varnames,
        save=True,
        filename=str(plot_file),
    )
    print(f"  Saved plot to {plot_file}")

    sobol_df = pd.DataFrame(
        {"Var Names": varnames, "First_Order": sobol_first, "Total_Order": sobol_total}
    ).sort_values("Total_Order", ascending=False)
    sobol_df.to_csv(csv_file, index=False)
    print(f"  Saved indices to {csv_file}")

    # Training quality plots
    PCE_metamodel = pce_result.getMetaModel()

    array_of_interest = database[value_of_interest]

    plot_training_quality(
        database=database,
        varnames=feature_names,
        value_of_interest=array_of_interest,
        kk_values=kk_values,
        # the first ntimes rows of input_array hold the physical inputs
        input_variable_array_physical=input_array[:n_samples, :n_feature],
        n_feature=n_feature,
        nvar=n_feature + (0 if model_coeff_name is None else 1),
        PCE_metamodel=PCE_metamodel,
        save=True,
        plot_options=plot_options,
        seed=42,
        output_dir=output_dir,
    )

    return results
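
# End-to-end sketch (not part of the original module): runs the full
# sensitivity driver on a synthetic dataset. Variable names "ws" and "kk"
# are assumptions; "model_bias_cap" matches the hardcoded target above.
def _example_run_pce_sensitivity():
    import xarray as xr  # only needed for this sketch

    rng = np.random.default_rng(3)
    ntimes, nsamples = 40, 8
    ws = rng.uniform(4.0, 14.0, ntimes)
    kk = np.linspace(0.03, 0.07, nsamples)
    bias = 0.05 * ws[None, :] + kk[:, None] + 0.01 * rng.normal(size=(nsamples, ntimes))
    ds = xr.Dataset(
        {
            "ws": ("wind_farm", ws),
            "kk": ("sample", kk),
            "model_bias_cap": (("sample", "wind_farm"), bias),
        },
        coords={"wind_farm": np.arange(ntimes), "sample": np.arange(nsamples)},
    )
    results = run_pce_sensitivity(
        ds,
        ["ws"],
        pce_config={
            "degree": 3,
            "model_coeff_name": "kk",
            "plot_options": {"scatter": False, "distribution": False, "metrics": []},
        },
        output_dir=Path("./pce_example_out"),
    )
    # "ws" dominates the variance in this toy setup
    print(results["sobol_total"])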