Coverage for wifa_uq / postprocessing / PCE_tool / pce_utils.py: 93%

301 statements  

coverage.py v7.13.0, created at 2025-12-19 02:10 +0000

import numpy as np
import pandas as pd
from pathlib import Path
import openturns as ot
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import wasserstein_distance, ks_2samp, entropy


def build_input_output_arrays(ds, stochastic_vars, model_vars, output_var):
    """
    Build the input and output arrays for PCE construction from a dataset.
    """
    ntimes = len(ds["wind_farm"].values)
    kk_values = ds[model_vars[0]].values if model_vars else np.array([0.0])
    if not model_vars:
        nk = 1
    else:
        nk = ds.sizes["sample"]

    n_feature = len(stochastic_vars)
    nvar = n_feature + len(model_vars)

    input_array_physical = np.zeros((ntimes, n_feature))
    input_array = np.zeros((ntimes * nk, nvar))
    output_array = np.zeros((ntimes * nk))

    # Physical variables: repeat each time series once per model-variable sample
    for j, var in enumerate(stochastic_vars):
        values = np.array(ds[var])
        input_array_physical[:, j] = values
        for i in range(nk):
            input_array[i * ntimes : (i + 1) * ntimes, j] = values

    # Model variables: constant within each block of ntimes rows
    for j, var in enumerate(model_vars):
        for i in range(nk):
            input_array[i * ntimes : (i + 1) * ntimes, n_feature + j] = kk_values[i]

    # Output
    if not model_vars:
        # output_var expected shape (sample, ntimes)
        ymean = ds[output_var].mean(dim="sample").values  # shape (ntimes,)
        output_array[:] = np.tile(ymean, nk)  # or simply: output_array = np.tile(ymean, nk)
    else:
        for i in range(nk):
            output_array[i * ntimes : (i + 1) * ntimes] = ds[output_var][i, :]

    varnames = stochastic_vars + model_vars
    return input_array, output_array, varnames, kk_values, n_feature
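
# Illustrative sketch (not part of the original module): exercises
# build_input_output_arrays on a tiny synthetic xarray Dataset. The
# variable names "ws", "kk", and "model_bias_cap" are assumptions chosen
# to match the target names used by run_pce_sensitivity below.
def _example_build_arrays():
    import xarray as xr  # only needed for this sketch

    rng = np.random.default_rng(0)
    ntimes, nsamples = 5, 3
    ds = xr.Dataset(
        {
            "ws": ("wind_farm", rng.uniform(5.0, 15.0, ntimes)),
            "kk": ("sample", np.linspace(0.03, 0.07, nsamples)),
            "model_bias_cap": (
                ("sample", "wind_farm"),
                rng.normal(0.0, 1.0, (nsamples, ntimes)),
            ),
        },
        coords={"wind_farm": np.arange(ntimes), "sample": np.arange(nsamples)},
    )
    X, y, names, kk, nf = build_input_output_arrays(ds, ["ws"], ["kk"], "model_bias_cap")
    # X stacks the "ws" series once per kk sample: shape (ntimes * nsamples, 2)
    print(X.shape, y.shape, names, nf)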

def construct_PCE_ot(
    input_array, output_array, marginals, copula, degree, LARS=True, q=1
):
    """
    Construct a Polynomial Chaos Expansion using OpenTURNS.
    """
    Nt = input_array.shape[0]
    Nvar = input_array.shape[1]

    # Create samples
    outputSample = ot.Sample(Nt, 1)
    for i in range(Nt):
        outputSample[i, 0] = output_array[i]

    polyColl = ot.PolynomialFamilyCollection(Nvar)
    collection = ot.DistributionCollection(Nvar)
    marginal = {}
    UncorrelatedInputSample = ot.Sample(input_array)

    # Marginals: one distribution per input variable
    for i in range(Nvar):
        varSample = ot.Sample(Nt, 1)
        for j in range(Nt):
            varSample[j, 0] = input_array[j, i]
        minValue = varSample.getMin()[0]
        maxValue = varSample.getMax()[0]
        if marginals[i] == "kernel":
            marginal[i] = ot.KernelSmoothing().build(varSample)
        elif marginals[i] == "uniform":
            # Widen the support slightly so boundary samples stay inside it
            marginal[i] = ot.Uniform(minValue - 1e-5, maxValue + 1e-5)
        else:
            marginal[i] = ot.NormalFactory().build(varSample)
        collection[i] = ot.Distribution(marginal[i])

    # Copula (falls back to independent for unrecognized names)
    if copula == "independent":
        copula = ot.IndependentCopula(Nvar)
    elif copula in ["gaussian", "normal"]:
        copula = ot.NormalCopulaFactory().build(ot.Sample(input_array))
    else:
        copula = ot.IndependentCopula(Nvar)

    UncorrelatedInputDistribution = ot.ComposedDistribution(collection, copula)

    # Polynomial basis: orthogonal family per marginal
    for v in range(Nvar):
        marginalv = UncorrelatedInputDistribution.getMarginal(v)
        polyColl[v] = ot.StandardDistributionPolynomialFactory(marginalv)

    enumerateFunction = ot.HyperbolicAnisotropicEnumerateFunction(Nvar, q)
    multivariateBasis = ot.OrthogonalProductPolynomialFactory(
        polyColl, enumerateFunction
    )
    P = enumerateFunction.getStrataCumulatedCardinal(degree)
    adaptiveStrategy = ot.FixedStrategy(multivariateBasis, P)

    if LARS:
        # Sparse basis selection: LARS with corrected leave-one-out validation
        basisSequenceFactory = ot.LARS()
        fittingAlgorithm = ot.CorrectedLeaveOneOut()
        approximationAlgorithm = ot.LeastSquaresMetaModelSelectionFactory(
            basisSequenceFactory, fittingAlgorithm
        )
        projectionStrategy = ot.LeastSquaresStrategy(
            UncorrelatedInputSample, outputSample, approximationAlgorithm
        )
        algo = ot.FunctionalChaosAlgorithm(
            UncorrelatedInputSample,
            outputSample,
            UncorrelatedInputDistribution,
            adaptiveStrategy,
            projectionStrategy,
        )
    else:
        wei_exp = ot.MonteCarloExperiment(
            UncorrelatedInputDistribution, UncorrelatedInputSample.getSize()
        )
        X_UncorrelatedInputSample, weights = wei_exp.generateWithWeights()
        projectionStrategy = ot.LeastSquaresStrategy()
        algo = ot.FunctionalChaosAlgorithm(
            X_UncorrelatedInputSample,
            weights,
            outputSample,
            UncorrelatedInputDistribution,
            adaptiveStrategy,
            projectionStrategy,
        )

    algo.run()
    return algo.getResult()
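
# Minimal fit-and-evaluate sketch (not part of the original module):
# fits a degree-3 PCE with uniform marginals to a known polynomial and
# evaluates the resulting metamodel at one point.
def _example_pce_fit():
    rng = np.random.default_rng(1)
    X = rng.uniform(0.0, 1.0, (200, 2))
    y = X[:, 0] ** 2 + 0.5 * X[:, 1]
    result = construct_PCE_ot(X, y, ["uniform", "uniform"], "independent", degree=3)
    metamodel = result.getMetaModel()
    # Should be close to 0.3**2 + 0.5 * 0.7 = 0.44
    print(float(metamodel([0.3, 0.7])[0]))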

def compute_sobol_indices(pce_result, nvar):
    """
    Compute first-order and total Sobol indices for each input variable.
    """
    first_sobol = np.zeros((nvar))
    total_sobol = np.zeros((nvar))
    chaosSI = ot.FunctionalChaosSobolIndices(pce_result)
    for v in range(nvar):
        first_sobol[v] = chaosSI.getSobolIndex(v)
        total_sobol[v] = chaosSI.getSobolTotalIndex(v)
    return first_sobol, total_sobol
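
# Hedged sketch (not in the original module): Sobol indices from a PCE
# fitted to a toy function where x0 drives most of the output variance.
def _example_sobol():
    rng = np.random.default_rng(2)
    X = rng.uniform(-1.0, 1.0, (300, 2))
    y = np.sin(np.pi * X[:, 0]) + 0.1 * X[:, 1]
    result = construct_PCE_ot(X, y, ["uniform", "uniform"], "independent", degree=5)
    first, total = compute_sobol_indices(result, nvar=2)
    # Expect first[0] and total[0] to dominate
    print(first, total)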

def plot_sobol_indices(
    first, total, varnames, save=False, filename="pce_sobol_indices.png"
):
    """
    Plot Sobol indices
    """
    colors = ["#002d74", "#e85113"]
    nvar = len(varnames)
    bar_width = 0.24
    bar_positions1 = np.arange(nvar) + 0.5 * bar_width
    bar_positions2 = np.arange(nvar) + 1.5 * bar_width

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(bar_positions1, first, width=bar_width, color=colors[0], label="1st order")
    ax.bar(bar_positions2, total, width=bar_width, color=colors[1], label="Total order")
    ax.set_xticks(bar_positions2)
    ax.set_xticklabels(varnames, rotation=45, ha="right")
    ax.set_xlabel("Variables")
    ax.set_ylabel("Sobol indices")
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    if save:
        plt.savefig(filename, dpi=150)
    plt.show()
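
# Usage sketch for the bar plot (synthetic indices, hypothetical names):
def _example_plot_sobol():
    first = np.array([0.55, 0.20, 0.05])
    total = np.array([0.65, 0.30, 0.10])
    plot_sobol_indices(first, total, ["ws", "ti", "kk"], save=False)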

def plot_training_quality(
    database,
    varnames,
    value_of_interest,
    kk_values,
    input_variable_array_physical,
    n_feature,
    nvar,
    PCE_metamodel,
    save=True,
    plot_options=None,
    seed=42,
    output_dir="./PCE_training_quality/",
):
    """
    Evaluate PCE training quality via scatter plots, distribution comparison, and metrics.

    Special case:
        - If there is no model variable (nvar == n_feature), the PCE is deterministic per time index.

    Args:
        database: Dataset of inputs.
        varnames: Physical (stochastic) variable names.
        value_of_interest: Target array of shape (sample_size, ntimes).
        kk_values: Model-variable values per sample (None if no model variable).
        input_variable_array_physical: Physical input array (n_samples, n_feature).
        n_feature: Number of physical variables.
        nvar: Total number of input variables (physical + model).
        PCE_metamodel: OpenTURNS PCE metamodel callable.
        save: Whether to save figures to disk.
        plot_options: Dict with flags: {"scatter": bool, "distribution": bool, "metrics": list}.
        seed: RNG seed for selecting time indices.
        output_dir: Output directory for figures (str or Path).

    Returns:
        None
    """
    plot_options = plot_options or {}  # guard against the None default
    output_dir = Path(output_dir)  # the "/" joins below require a Path

    ntimes = value_of_interest.shape[1]

    # Always use the dataset sample dimension
    sample_size = value_of_interest.shape[0]

    # Detect the presence of a model variable
    has_model_var = (
        (nvar > n_feature)
        and (kk_values is not None)
        and (len(kk_values) == sample_size)
    )

    # Compute PCE realizations: shape (sample_size, ntimes)
    PCE_realizations = np.zeros((sample_size, ntimes))

    for it in range(ntimes):
        physical_var = input_variable_array_physical[it, :]

        if has_model_var:
            realization = np.zeros(nvar)
            realization[:n_feature] = physical_var
            for j, k in enumerate(kk_values):
                realization[-1] = k
                PCE_realizations[j, it] = PCE_metamodel(realization)[0]
        else:
            # Deterministic (no model variable): one prediction per time index,
            # broadcast over samples
            realization = np.zeros(n_feature)
            realization[:] = physical_var
            yhat = PCE_metamodel(realization)[0]
            PCE_realizations[:, it] = yhat

    # ------------------------------------------------------------------
    # SPECIAL CASE: no model variable -> compare all (sample, time) pairs
    # ------------------------------------------------------------------

    if not has_model_var and plot_options.get("scatter", False):
        # PCE deterministic per time index -> one prediction per it
        obs = np.asarray(value_of_interest)  # (sample_size, ntimes)
        pce_curve = np.asarray(PCE_realizations[0, :])  # (ntimes,)

        # Build scatter arrays over all (sample, time) pairs
        x = obs.ravel()  # all observed points
        y = np.tile(pce_curve, sample_size)  # predictions repeated over samples

        fig, ax = plt.subplots(1, 1, figsize=(7, 6))
        ax.grid(True)

        ax.plot(
            x,
            y,
            "r.",
            alpha=0.35,
            markersize=4,
            label="PCE vs Data (all samples, all times)",
        )

        # y=x reference
        xmin = float(min(np.min(x), np.min(y)))
        xmax = float(max(np.max(x), np.max(y)))
        ax.plot([xmin, xmax], [xmin, xmax], "k--", linewidth=1.5, label="y=x")

        ax.set_xlabel("Observed values")
        ax.set_ylabel("PCE prediction")
        ax.set_title(
            "Training scatter (no model var): PCE deterministic per time index"
        )
        ax.legend()

        plt.tight_layout()
        if save:
            plt.savefig(output_dir / "PCE_training_scatter_no_modelvar.png", dpi=150)
        # plt.show()

        return

    # ------------------------------------------------------------------
    # Standard case (model variable present)
    # ------------------------------------------------------------------

    # Fix the random seed for reproducibility
    rng = np.random.default_rng(seed)
    n_pick = 4 if ntimes >= 4 else ntimes
    time_indices = rng.choice(ntimes, n_pick, replace=False)

    # --- Scatter plot ---
    if plot_options.get("scatter", False):
        fig, ax = plt.subplots(1, n_pick, figsize=(4.5 * n_pick, 6))
        if n_pick == 1:
            ax = [ax]

        for idx, it in enumerate(time_indices):
            # Build a wrapped title listing the physical inputs at this time index
            line_len = 0
            str_title = ""
            for var_name in varnames:
                value_str = f"{var_name}={np.array(database[var_name])[it]:.4g}"
                if line_len + len(value_str) > 30:
                    str_title += "\n"
                    line_len = 0
                str_title += value_str + ", "
                line_len += len(value_str) + 2
            str_title = str_title.rstrip(", ")

            ax[idx].set_title(str_title)
            ax[idx].grid(True)

            ax[idx].plot(
                value_of_interest[:, it],
                PCE_realizations[:, it],
                "r.",
                label="PCE vs Data",
            )
            ax[idx].plot(
                value_of_interest[:, it], value_of_interest[:, it], "k--", label="y=x"
            )
            ax[idx].set_xlabel("Observed biases")
            ax[idx].set_ylabel("PCE biases")
            ax[idx].legend()

        plt.tight_layout(rect=[0, 0, 1, 0.95])
        if save:
            plt.savefig(output_dir / "PCE_training_scatter.png", dpi=150)
        # plt.show()

    # --- Distribution comparison ---
    if plot_options.get("distribution", False):
        fig, ax = plt.subplots(1, n_pick, figsize=(4.5 * n_pick, 6))
        if n_pick == 1:
            ax = [ax]

        mb_min = float(value_of_interest.min().values)
        mb_max = float(value_of_interest.max().values)
        pce_min = float(np.min(PCE_realizations))
        pce_max = float(np.max(PCE_realizations))
        xs = np.linspace(min(mb_min, pce_min), max(mb_max, pce_max), 1000)

        def safe_kde(arr):
            # Guard against degenerate samples for which a KDE is undefined
            arr = np.asarray(arr)
            if arr.size < 2:
                return None
            if np.std(arr) <= 1e-12:
                return None
            try:
                return stats.gaussian_kde(arr)
            except Exception:
                return None

        for i, it in enumerate(time_indices):
            ref = np.asarray(value_of_interest[:, it])
            pce = np.asarray(PCE_realizations[:, it])

            ref_kde = safe_kde(ref)
            pce_kde = safe_kde(pce)

            if ref_kde is not None:
                ax[i].plot(xs, ref_kde(xs), color="green", label="Observed")
            else:
                ax[i].axvline(
                    np.mean(ref),
                    color="green",
                    linestyle="--",
                    label="Observed (degenerate)",
                )

            if pce_kde is not None:
                ax[i].plot(xs, pce_kde(xs), color="blue", label="PCE")
            else:
                ax[i].axvline(
                    np.mean(pce), color="blue", linestyle="--", label="PCE (degenerate)"
                )

            ax[i].set_xlabel("Biases")
            ax[i].set_ylabel("PDF")
            ax[i].grid(True)
            if i == 0:
                ax[i].legend()

        if save:
            plt.savefig(output_dir / "PCE_training_distribution.png", dpi=150)
        # plt.show()

    # --- Metrics ---
    metrics_to_plot = plot_options.get("metrics", [])
    if metrics_to_plot:
        mb_min = float(value_of_interest.min().values)
        mb_max = float(value_of_interest.max().values)
        pce_min = float(np.min(PCE_realizations))
        pce_max = float(np.max(PCE_realizations))
        xs = np.linspace(min(mb_min, pce_min), max(mb_max, pce_max), 1000)

        metric_dict = {"RMSE": [], "R2": [], "Wasserstein": [], "KS": [], "KL": []}

        def can_kde(arr):
            arr = np.asarray(arr)
            return (arr.size >= 2) and (np.std(arr) > 1e-12)

        for it in range(ntimes):
            ref = np.asarray(value_of_interest[:, it])
            pce = np.asarray(PCE_realizations[:, it])

            metric_dict["RMSE"].append(np.sqrt(mean_squared_error(ref, pce)))
            metric_dict["R2"].append(r2_score(ref, pce))
            metric_dict["Wasserstein"].append(wasserstein_distance(ref, pce))

            ks_stat, _ = ks_2samp(ref, pce)
            metric_dict["KS"].append(ks_stat)

            # KL divergence between normalized KDE estimates of both samples
            if "KL" in metrics_to_plot and can_kde(ref) and can_kde(pce):
                try:
                    ref_kde = stats.gaussian_kde(ref)
                    pce_kde = stats.gaussian_kde(pce)
                    p_vals = ref_kde(xs) + 1e-12
                    q_vals = pce_kde(xs) + 1e-12
                    p_vals /= np.sum(p_vals)
                    q_vals /= np.sum(q_vals)
                    metric_dict["KL"].append(entropy(p_vals, q_vals))
                except Exception:
                    metric_dict["KL"].append(np.nan)
            else:
                metric_dict["KL"].append(np.nan)

        fig, axes = plt.subplots(
            len(metrics_to_plot), 1, figsize=(10, 3 * len(metrics_to_plot)), sharex=True
        )
        if len(metrics_to_plot) == 1:
            axes = [axes]

        for ax, metric in zip(axes, metrics_to_plot):
            ax.plot(metric_dict[metric], label=metric, color="tab:blue")
            ax.set_ylabel(metric)
            ax.grid(True)
            ax.legend()

        axes[-1].set_xlabel("Time index")
        plt.tight_layout()
        if save:
            plt.savefig(output_dir / "PCE_training_metrics.png", dpi=150)
        # plt.show()
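
# Illustrative sketch (not part of the original module): drives
# plot_training_quality with synthetic stand-ins. The lambda mimics an
# OpenTURNS metamodel, which returns an indexable point; "ws" is a
# hypothetical feature name.
def _example_training_quality():
    rng = np.random.default_rng(0)
    ntimes, nsamp = 12, 6
    db = {"ws": rng.uniform(5.0, 15.0, ntimes)}
    kk = np.linspace(0.03, 0.07, nsamp)
    X_phys = db["ws"].reshape(-1, 1)  # (ntimes, 1)
    truth = 0.1 * X_phys.T + kk.reshape(-1, 1)  # (nsamp, ntimes)
    meta = lambda x: [0.1 * x[0] + x[-1]]  # stand-in for a fitted PCE
    plot_training_quality(
        database=db,
        varnames=["ws"],
        value_of_interest=truth,
        kk_values=kk,
        input_variable_array_physical=X_phys,
        n_feature=1,
        nvar=2,
        PCE_metamodel=meta,
        save=False,
        plot_options={"scatter": True},
    )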

def run_pce_sensitivity(database, feature_names, pce_config: dict, output_dir: Path):
    """
    PCE-based sensitivity analysis on the model error.

    Computes Sobol indices to determine which physical features
    contribute most to the error variance.

    Args:
        database: xarray Dataset
        feature_names: List of feature names corresponding to X columns
        pce_config: Dict with PCE settings:
            - degree (int): Polynomial degree, default 5
            - marginals (str): 'kernel', 'uniform', or 'normal', default 'kernel'
            - copula (str): 'independent' or 'normal', default 'independent'
            - q (float): Hyperbolic truncation parameter, default 1.0
            - model_coeff_name (str or None): Model coefficient name, default None
            - plot_options (dict): Settings to evaluate PCE quality:
                - scatter (bool): activate the scatter plot
                - distribution (bool): activate the distribution plot
                - metrics (list): Metrics to plot, among ["RMSE", "R2", "Wasserstein", "KS", "KL"]
        output_dir: Path to save plots and CSV output

    Returns:
        Dict with:
            - 'sobol_first': dict mapping feature names to first-order indices
            - 'sobol_total': dict mapping feature names to total-order indices
            - 'pce_result': the fitted PCE object (for further analysis if needed)
            - 'feature_names': list of feature names
            - 'model_coeff_name': model coefficient name
            - 'varnames': list of variable names, feature_names + model_coeff_name
            - 'value_of_interest': target name, 'ref_power_cap' if model_coeff_name is None, else 'model_bias_cap'
    """
    # Extract config with defaults
    degree = pce_config.get("degree", 5)
    marginals = pce_config.get("marginals", "kernel")
    copula = pce_config.get("copula", "independent")
    q = pce_config.get("q", 1.0)
    DEFAULT_PLOT_OPTIONS = {
        "scatter": True,
        "distribution": False,
        "metrics": [],
    }
    user_plot_options = pce_config.get("plot_options", {}) or {}
    plot_options = {**DEFAULT_PLOT_OPTIONS, **user_plot_options}

    print("pce_config keys:", pce_config.keys())
    print("pce_config:", pce_config)
    print("plot_options:", plot_options)
    model_coeff_name = pce_config.get("model_coeff_name", None)

    # Accept the string "none" (e.g. from a config file) as None
    if isinstance(model_coeff_name, str) and model_coeff_name.strip().lower() == "none":
        model_coeff_name = None

    # Decide model vars + target output (hardcoded target names)
    if model_coeff_name is None:
        model_vars = []
        value_of_interest = "ref_power_cap"
    else:
        model_vars = [model_coeff_name]
        value_of_interest = "model_bias_cap"

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n--- PCE Sensitivity (single run) ---")
    print(f"  model_coeff_name: {model_coeff_name}")
    print(f"  value_of_interest: {value_of_interest}")

    # Build arrays
    input_array, output_array, varnames, kk_values, n_feature = (
        build_input_output_arrays(
            database, feature_names, model_vars, value_of_interest
        )
    )

    # Validate shapes (n_features counts all PCE inputs: physical + model)
    n_samples, n_features = input_array.shape

    print("--- Running PCE Sensitivity Analysis ---")
    print(f"  Samples: {n_samples}, Features: {n_features}")
    print(f"  Degree: {degree}, Marginals: {marginals}, Copula: {copula}, q: {q}")
    print(f"  Features: {feature_names}")

    # Construct PCE metamodel
    print("  Constructing PCE metamodel...")
    pce_result = construct_PCE_ot(
        input_array=input_array,
        output_array=output_array,
        marginals=[marginals] * n_features,
        copula=copula,
        degree=degree,
        q=q,
    )

    # Compute Sobol indices
    print("  Computing Sobol indices...")
    sobol_first, sobol_total = compute_sobol_indices(pce_result, n_features)

    # Create the results dict
    results = {
        "sobol_first": dict(zip(varnames, sobol_first)),
        "sobol_total": dict(zip(varnames, sobol_total)),
        "pce_result": pce_result,
        "feature_names": feature_names,
        "model_coeff_name": model_coeff_name,
        "varnames": varnames,
        "value_of_interest": value_of_interest,
    }

    # Save outputs (file suffix based on model_coeff_name/target)
    tag = "no_modelvar" if model_coeff_name is None else str(model_coeff_name)
    plot_file = output_dir / f"pce_sobol_indices_{tag}.png"
    csv_file = output_dir / f"pce_sobol_indices_{tag}.csv"

    # Plot Sobol indices
    plot_sobol_indices(
        sobol_first,
        sobol_total,
        varnames,
        save=True,
        filename=str(plot_file),
    )
    print(f"  Saved plot to {plot_file}")

    sobol_df = pd.DataFrame(
        {"Var Names": varnames, "First_Order": sobol_first, "Total_Order": sobol_total}
    ).sort_values("Total_Order", ascending=False)
    sobol_df.to_csv(csv_file, index=False)
    print(f"  Saved indices to {csv_file}")

    # Training quality plots
    PCE_metamodel = pce_result.getMetaModel()

    array_of_interest = database[value_of_interest]

    plot_training_quality(
        database=database,
        varnames=feature_names,
        value_of_interest=array_of_interest,
        kk_values=kk_values,
        # the first ntimes rows of input_array hold the physical inputs
        input_variable_array_physical=input_array[:n_samples, :n_feature],
        n_feature=n_feature,
        nvar=n_feature + (0 if model_coeff_name is None else 1),
        PCE_metamodel=PCE_metamodel,
        save=True,
        plot_options=plot_options,
        seed=42,
        output_dir=output_dir,
    )

    return results
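
# End-to-end sketch (not part of the original module): runs the full
# sensitivity driver on a synthetic dataset. Variable names "ws" and "kk"
# are assumptions; "model_bias_cap" matches the hardcoded target above.
def _example_run_pce_sensitivity():
    import xarray as xr  # only needed for this sketch

    rng = np.random.default_rng(3)
    ntimes, nsamples = 40, 8
    ws = rng.uniform(4.0, 14.0, ntimes)
    kk = np.linspace(0.03, 0.07, nsamples)
    bias = 0.05 * ws[None, :] + kk[:, None] + 0.01 * rng.normal(size=(nsamples, ntimes))
    ds = xr.Dataset(
        {
            "ws": ("wind_farm", ws),
            "kk": ("sample", kk),
            "model_bias_cap": (("sample", "wind_farm"), bias),
        },
        coords={"wind_farm": np.arange(ntimes), "sample": np.arange(nsamples)},
    )
    results = run_pce_sensitivity(
        ds,
        ["ws"],
        pce_config={
            "degree": 3,
            "model_coeff_name": "kk",
            "plot_options": {"scatter": False, "distribution": False, "metrics": []},
        },
        output_dir=Path("./pce_example_out"),
    )
    # "ws" dominates the variance in this toy setup
    print(results["sobol_total"])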