Coverage for wifa_uq / postprocessing / PCE_tool / pce_utils.py: 93%
301 statements
coverage.py v7.13.0, created at 2025-12-19 02:10 +0000
import numpy as np
import pandas as pd
from pathlib import Path
import openturns as ot
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import wasserstein_distance, ks_2samp, entropy

def build_input_output_arrays(ds, stochastic_vars, model_vars, output_var):
    """
    Build input and output arrays for PCE from dataset
    """
    ntimes = len(ds["wind_farm"].values)
    kk_values = ds[model_vars[0]].values if model_vars else np.array([0.0])
    if not model_vars:
        nk = 1
    else:
        nk = ds.sizes["sample"]

    n_feature = len(stochastic_vars)
    nvar = n_feature + len(model_vars)

    input_array_physical = np.zeros((ntimes, n_feature))
    input_array = np.zeros((ntimes * nk, nvar))
    output_array = np.zeros((ntimes * nk))

    # Physical variables
    for j, var in enumerate(stochastic_vars):
        values = np.array(ds[var])
        input_array_physical[:, j] = values
        for i in range(nk):
            input_array[i * ntimes : (i + 1) * ntimes, j] = values

    # Model variables
    for j, var in enumerate(model_vars):
        for i in range(nk):
            input_array[i * ntimes : (i + 1) * ntimes, n_feature + j] = kk_values[i]
    # Output
    if not model_vars:
        # output_var expected shape (sample, ntimes)
        ymean = ds[output_var].mean(dim="sample").values  # shape (ntimes,)
        # nk == 1 here, so the tile is a plain copy
        # (or, more directly: output_array = np.tile(ymean, nk))
        output_array[:] = np.tile(ymean, nk)
    else:
        for i in range(nk):
            output_array[i * ntimes : (i + 1) * ntimes] = ds[output_var][i, :]
    varnames = stochastic_vars + model_vars
    return input_array, output_array, varnames, kk_values, n_feature
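

# ---------------------------------------------------------------------------
# Usage sketch (not executed on import). A minimal, hedged example of driving
# build_input_output_arrays with a toy xarray Dataset; the variable names
# "ws", "kk" and "model_bias_cap" are illustrative placeholders, not names
# this module requires. Assumes xarray is installed.
# ---------------------------------------------------------------------------
def _example_build_input_output_arrays():
    import xarray as xr  # assumed available alongside this module

    ntimes, nsamples = 5, 3
    rng = np.random.default_rng(0)
    ds = xr.Dataset(
        {
            "ws": ("wind_farm", rng.uniform(4.0, 12.0, ntimes)),
            "kk": ("sample", np.linspace(0.03, 0.07, nsamples)),
            "model_bias_cap": (
                ("sample", "wind_farm"),
                rng.normal(size=(nsamples, ntimes)),
            ),
        },
        coords={"wind_farm": np.arange(ntimes), "sample": np.arange(nsamples)},
    )
    X, y, varnames, kk_values, n_feature = build_input_output_arrays(
        ds, stochastic_vars=["ws"], model_vars=["kk"], output_var="model_bias_cap"
    )
    # X has shape (ntimes * nsamples, 2): column 0 repeats "ws" for each sample
    # block, column 1 holds that block's "kk" value; y is flattened to match.
    assert X.shape == (ntimes * nsamples, 2)
    return X, y, varnames, kk_values, n_feature
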
def construct_PCE_ot(
    input_array, output_array, marginals, copula, degree, LARS=True, q=1
):
    """
    Construct Polynomial Chaos Expansion using OpenTURNS
    """
    Nt = input_array.shape[0]
    Nvar = input_array.shape[1]

    # Create samples
    outputSample = ot.Sample(Nt, 1)
    for i in range(Nt):
        outputSample[i, 0] = output_array[i]

    polyColl = ot.PolynomialFamilyCollection(Nvar)
    collection = ot.DistributionCollection(Nvar)
    marginal = {}
    UncorrelatedInputSample = ot.Sample(input_array)

    # Marginals
    for i in range(Nvar):
        varSample = ot.Sample(Nt, 1)
        for j in range(Nt):
            varSample[j, 0] = input_array[j, i]
        minValue = varSample.getMin()[0]
        maxValue = varSample.getMax()[0]
        if marginals[i] == "kernel":
            marginal[i] = ot.KernelSmoothing().build(varSample)
        elif marginals[i] == "uniform":
            marginal[i] = ot.Uniform(minValue - 1e-5, maxValue + 1e-5)
        else:
            marginal[i] = ot.NormalFactory().build(varSample)
        collection[i] = ot.Distribution(marginal[i])

    # Copula
    if copula == "independent":
        copula = ot.IndependentCopula(Nvar)
    elif copula in ["gaussian", "normal"]:
        copula = ot.NormalCopulaFactory().build(ot.Sample(input_array))
    else:
        copula = ot.IndependentCopula(Nvar)

    UncorrelatedInputDistribution = ot.ComposedDistribution(collection, copula)

    # Polynomial basis
    for v in range(Nvar):
        marginalv = UncorrelatedInputDistribution.getMarginal(v)
        polyColl[v] = ot.StandardDistributionPolynomialFactory(marginalv)

    enumerateFunction = ot.HyperbolicAnisotropicEnumerateFunction(Nvar, q)
    multivariateBasis = ot.OrthogonalProductPolynomialFactory(
        polyColl, enumerateFunction
    )
    P = enumerateFunction.getStrataCumulatedCardinal(degree)
    adaptativeStrategy = ot.FixedStrategy(multivariateBasis, P)

    if LARS:
        basisSequenceFactory = ot.LARS()
        fittingAlgorithm = ot.CorrectedLeaveOneOut()
        approximationAlgorithm = ot.LeastSquaresMetaModelSelectionFactory(
            basisSequenceFactory, fittingAlgorithm
        )
        projectionStrategy = ot.LeastSquaresStrategy(
            UncorrelatedInputSample, outputSample, approximationAlgorithm
        )
        algo = ot.FunctionalChaosAlgorithm(
            UncorrelatedInputSample,
            outputSample,
            UncorrelatedInputDistribution,
            adaptativeStrategy,
            projectionStrategy,
        )
    else:
        wei_exp = ot.MonteCarloExperiment(
            UncorrelatedInputDistribution, UncorrelatedInputSample.getSize()
        )
        X_UncorrelatedInputSample, weights = wei_exp.generateWithWeights()
        projectionStrategy = ot.LeastSquaresStrategy()
        algo = ot.FunctionalChaosAlgorithm(
            X_UncorrelatedInputSample,
            weights,
            outputSample,
            UncorrelatedInputDistribution,
            adaptativeStrategy,
            projectionStrategy,
        )

    algo.run()
    return algo.getResult()
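

# ---------------------------------------------------------------------------
# Usage sketch (not executed on import): fit a PCE to a known analytic target
# on synthetic data and query the metamodel. A minimal, hedged example; the
# sample size, degree and target function are arbitrary illustrative choices.
# ---------------------------------------------------------------------------
def _example_construct_PCE_ot():
    rng = np.random.default_rng(0)
    X = rng.uniform(-1.0, 1.0, size=(200, 2))
    y = X[:, 0] ** 2 + 0.5 * X[:, 1]  # smooth target, exactly degree 2

    result = construct_PCE_ot(
        input_array=X,
        output_array=y,
        marginals=["uniform", "uniform"],
        copula="independent",
        degree=3,
        LARS=True,
        q=1.0,
    )
    metamodel = result.getMetaModel()
    # Near-exact recovery expected: the target lies inside the polynomial basis.
    print(metamodel([0.3, -0.2]))  # ~ 0.3**2 + 0.5 * (-0.2) = -0.01
    return result
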
def compute_sobol_indices(pce_result, nvar):
    """
    Compute first-order and total Sobol indices for each input variable
    """
    first_sobol = np.zeros((nvar))
    total_sobol = np.zeros((nvar))
    chaosSI = ot.FunctionalChaosSobolIndices(pce_result)
    for v in range(nvar):
        first_sobol[v] = chaosSI.getSobolIndex(v)
        total_sobol[v] = chaosSI.getSobolTotalIndex(v)
    return first_sobol, total_sobol
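

# ---------------------------------------------------------------------------
# Usage sketch (not executed on import), reusing the synthetic PCE above. For
# y = x0**2 + 0.5*x1 with independent inputs there are no interaction terms,
# so first-order and total indices should nearly coincide and sum to ~1.
# ---------------------------------------------------------------------------
def _example_compute_sobol_indices():
    pce_result = _example_construct_PCE_ot()
    first, total = compute_sobol_indices(pce_result, nvar=2)
    print(dict(zip(["x0", "x1"], first)))
    print(dict(zip(["x0", "x1"], total)))
    return first, total
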
def plot_sobol_indices(
    first, total, varnames, save=False, filename="pce_sobol_indices.png"
):
    """
    Plot Sobol indices
    """
    colors = ["#002d74", "#e85113"]
    nvar = len(varnames)
    bar_width = 0.24
    bar_positions1 = np.arange(nvar) + 0.5 * bar_width
    bar_positions2 = np.arange(nvar) + 1.5 * bar_width

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(bar_positions1, first, width=bar_width, color=colors[0], label="1st order")
    ax.bar(bar_positions2, total, width=bar_width, color=colors[1], label="Total order")
    ax.set_xticks(bar_positions2)
    ax.set_xticklabels(varnames, rotation=45, ha="right")
    ax.set_xlabel("Variables")
    ax.set_ylabel("Sobol indices")
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    if save:
        plt.savefig(filename, dpi=150)
    plt.show()
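

# ---------------------------------------------------------------------------
# Usage sketch (not executed on import): plot hand-picked indices for two
# illustrative variables. Values and names are placeholders, not real results.
# ---------------------------------------------------------------------------
def _example_plot_sobol_indices():
    plot_sobol_indices(
        first=np.array([0.62, 0.30]),
        total=np.array([0.68, 0.36]),
        varnames=["ws", "ti"],
        save=False,
    )
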
def plot_training_quality(
    database,
    varnames,
    value_of_interest,
    kk_values,
    input_variable_array_physical,
    n_feature,
    nvar,
    PCE_metamodel,
    save=True,
    plot_options=None,
    seed=42,
    output_dir="./PCE_training_quality/",
):
    """
    Evaluate PCE training quality via scatter plots, distribution comparison, and metrics.

    Special case:
    - If there is no model variable (nvar == n_feature), the PCE is deterministic per time index.

    Args:
        database: Dataset of inputs.
        varnames: Physical (stochastic) variable names.
        value_of_interest: Target array of shape (sample_size, ntimes).
        kk_values: Model-variable values per sample (None if no model variable).
        input_variable_array_physical: Physical input array (n_samples, n_feature).
        n_feature: Number of physical variables.
        nvar: Total number of input variables (physical + model).
        PCE_metamodel: OpenTURNS PCE metamodel callable.
        save: Whether to save figures to disk.
        plot_options: Dict with flags: {"scatter": bool, "distribution": bool, "metrics": list}.
        seed: RNG seed for selecting time indices.
        output_dir: Output directory for figures.

    Returns:
        None
    """
    # Normalize arguments: accept str or Path for output_dir (the default is a
    # str, but paths below are built with the / operator), and tolerate
    # plot_options=None.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    plot_options = plot_options or {}

    ntimes = value_of_interest.shape[1]

    # Always use dataset sample dimension
    sample_size = value_of_interest.shape[0]

    # Detect presence of a model variable
    has_model_var = (
        (nvar > n_feature)
        and (kk_values is not None)
        and (len(kk_values) == sample_size)
    )
    # Compute PCE realizations: shape (sample_size, ntimes)
    PCE_realizations = np.zeros((sample_size, ntimes))

    for it in range(ntimes):
        physical_var = input_variable_array_physical[it, :]

        if has_model_var:
            realization = np.zeros(nvar)
            realization[:n_feature] = physical_var
            for j, k in enumerate(kk_values):
                realization[-1] = k
                PCE_realizations[j, it] = PCE_metamodel(realization)[0]
        else:
            # Deterministic (no model var): one prediction per time index,
            # broadcast over samples
            realization = np.zeros(n_feature)
            realization[:] = physical_var
            yhat = PCE_metamodel(realization)[0]
            PCE_realizations[:, it] = yhat

    # ------------------------------------------------------------------
    # SPECIAL CASE: no model var -> plot value_of_interest[:, :] vs time
    # ------------------------------------------------------------------
    if not has_model_var and plot_options.get("scatter", False):
        # PCE deterministic per time index -> one prediction per it
        obs = np.asarray(value_of_interest)  # (sample_size, ntimes)
        pce_curve = np.asarray(PCE_realizations[0, :])  # (ntimes,)

        # Build scatter arrays over all (sample, time) pairs
        x = obs.ravel()  # all observed points
        y = np.tile(pce_curve, sample_size)  # predictions repeated over samples

        fig, ax = plt.subplots(1, 1, figsize=(7, 6))
        ax.grid(True)

        ax.plot(
            x,
            y,
            "r.",
            alpha=0.35,
            markersize=4,
            label="PCE vs Data (all samples, all times)",
        )

        # y=x reference
        xmin = float(min(np.min(x), np.min(y)))
        xmax = float(max(np.max(x), np.max(y)))
        ax.plot([xmin, xmax], [xmin, xmax], "k--", linewidth=1.5, label="y=x")

        ax.set_xlabel("Observed values")
        ax.set_ylabel("PCE prediction")
        ax.set_title(
            "Training scatter (no model var): PCE deterministic per time index"
        )
        ax.legend()

        plt.tight_layout()
        if save:
            plt.savefig(output_dir / "PCE_training_scatter_no_modelvar.png", dpi=150)
        # plt.show()

        return
    # ------------------------------------------------------------------
    # Standard case (model var exists): previous behavior
    # ------------------------------------------------------------------

    # Fix random seed for reproducibility
    rng = np.random.default_rng(seed)
    n_pick = 4 if ntimes >= 4 else ntimes
    time_indices = rng.choice(ntimes, n_pick, replace=False)

    # --- Scatter plot ---
    if plot_options.get("scatter", False):
        fig, ax = plt.subplots(1, n_pick, figsize=(4.5 * n_pick, 6))
        if n_pick == 1:
            ax = [ax]

        for idx, it in enumerate(time_indices):
            line_len = 0
            str_title = ""
            for var_name in varnames:
                value_str = f"{var_name}={np.array(database[var_name])[it]:.4g}"
                if line_len + len(value_str) > 30:
                    str_title += "\n"
                    line_len = 0
                str_title += value_str + ", "
                line_len += len(value_str) + 2
            str_title = str_title.rstrip(", ")

            ax[idx].set_title(str_title)
            ax[idx].grid(True)

            ax[idx].plot(
                value_of_interest[:, it],
                PCE_realizations[:, it],
                "r.",
                label="PCE vs Data",
            )
            ax[idx].plot(
                value_of_interest[:, it], value_of_interest[:, it], "k--", label="y=x"
            )
            ax[idx].set_xlabel("Observed biases")
            ax[idx].set_ylabel("PCE biases")
            ax[idx].legend()

        plt.tight_layout(rect=[0, 0, 1, 0.95])
        if save:
            plt.savefig(output_dir / "PCE_training_scatter.png", dpi=150)
        # plt.show()
    # --- Distribution comparison ---
    if plot_options.get("distribution", False):
        fig, ax = plt.subplots(1, n_pick, figsize=(4.5 * n_pick, 6))
        if n_pick == 1:
            ax = [ax]

        mb_min = float(value_of_interest.min().values)
        mb_max = float(value_of_interest.max().values)
        pce_min = float(np.min(PCE_realizations))
        pce_max = float(np.max(PCE_realizations))
        xs = np.linspace(min(mb_min, pce_min), max(mb_max, pce_max), 1000)

        def safe_kde(arr):
            arr = np.asarray(arr)
            if arr.size < 2:
                return None
            if np.std(arr) <= 1e-12:
                return None
            try:
                return stats.gaussian_kde(arr)
            except Exception:
                return None

        for i, it in enumerate(time_indices):
            ref = np.asarray(value_of_interest[:, it])
            pce = np.asarray(PCE_realizations[:, it])

            ref_kde = safe_kde(ref)
            pce_kde = safe_kde(pce)

            if ref_kde is not None:
                ax[i].plot(xs, ref_kde(xs), color="green", label="Observed")
            else:
                ax[i].axvline(
                    np.mean(ref),
                    color="green",
                    linestyle="--",
                    label="Observed (degenerate)",
                )

            if pce_kde is not None:
                ax[i].plot(xs, pce_kde(xs), color="blue", label="PCE")
            else:
                ax[i].axvline(
                    np.mean(pce), color="blue", linestyle="--", label="PCE (degenerate)"
                )

            ax[i].set_xlabel("Biases")
            ax[i].set_ylabel("PDF")
            ax[i].grid(True)
            if i == 0:
                ax[i].legend()

        if save:
            plt.savefig(output_dir / "PCE_training_distribution.png", dpi=150)
        # plt.show()
    # --- Metrics ---
    metrics_to_plot = plot_options.get("metrics", [])
    if metrics_to_plot:
        mb_min = float(value_of_interest.min().values)
        mb_max = float(value_of_interest.max().values)
        pce_min = float(np.min(PCE_realizations))
        pce_max = float(np.max(PCE_realizations))
        xs = np.linspace(min(mb_min, pce_min), max(mb_max, pce_max), 1000)

        metric_dict = {"RMSE": [], "R2": [], "Wasserstein": [], "KS": [], "KL": []}

        def can_kde(arr):
            arr = np.asarray(arr)
            return (arr.size >= 2) and (np.std(arr) > 1e-12)

        for it in range(ntimes):
            ref = np.asarray(value_of_interest[:, it])
            pce = np.asarray(PCE_realizations[:, it])

            metric_dict["RMSE"].append(np.sqrt(mean_squared_error(ref, pce)))
            metric_dict["R2"].append(r2_score(ref, pce))
            metric_dict["Wasserstein"].append(wasserstein_distance(ref, pce))

            ks_stat, _ = ks_2samp(ref, pce)
            metric_dict["KS"].append(ks_stat)

            if "KL" in metrics_to_plot and can_kde(ref) and can_kde(pce):
                try:
                    ref_kde = stats.gaussian_kde(ref)
                    pce_kde = stats.gaussian_kde(pce)
                    p_vals = ref_kde(xs) + 1e-12
                    q_vals = pce_kde(xs) + 1e-12
                    p_vals /= np.sum(p_vals)
                    q_vals /= np.sum(q_vals)
                    metric_dict["KL"].append(entropy(p_vals, q_vals))
                except Exception:
                    metric_dict["KL"].append(np.nan)
            else:
                metric_dict["KL"].append(np.nan)

        fig, axes = plt.subplots(
            len(metrics_to_plot), 1, figsize=(10, 3 * len(metrics_to_plot)), sharex=True
        )
        if len(metrics_to_plot) == 1:
            axes = [axes]

        for ax, metric in zip(axes, metrics_to_plot):
            ax.plot(metric_dict[metric], label=metric, color="tab:blue")
            ax.set_ylabel(metric)
            ax.grid(True)
            ax.legend()

        axes[-1].set_xlabel("Time index")
        plt.tight_layout()
        if save:
            plt.savefig(output_dir / "PCE_training_metrics.png", dpi=150)
        # plt.show()
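

# ---------------------------------------------------------------------------
# Hedged sketch of a plot_options dict accepted by plot_training_quality; the
# keys mirror the defaults applied in run_pce_sensitivity below. This constant
# is illustrative only. "metrics" may list any of "RMSE", "R2", "Wasserstein",
# "KS", "KL".
# ---------------------------------------------------------------------------
_EXAMPLE_PLOT_OPTIONS = {
    "scatter": True,  # observed-vs-predicted scatter per selected time index
    "distribution": True,  # KDE comparison of observed vs. PCE samples
    "metrics": ["RMSE", "R2"],  # per-time-index metric curves
}
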
def run_pce_sensitivity(database, feature_names, pce_config: dict, output_dir: Path):
    """
    PCE-based sensitivity analysis on error.

    Computes Sobol indices to determine which physical features
    contribute most to error variance.

    Args:
        database: xarray Dataset
        feature_names: List of feature names corresponding to X columns
        pce_config: Dict with PCE settings:
            - degree (int): Polynomial degree, default 5
            - marginals (str): 'kernel', 'uniform', or 'normal', default 'kernel'
            - copula (str): 'independent' or 'normal', default 'independent'
            - q (float): Hyperbolic truncation parameter, default 1.0
            - plot_options (dict): Settings to evaluate PCE quality:
                - scatter (bool): activate scatter plot
                - distribution (bool): activate distribution plot
                - metrics (list): Metrics to plot among ["RMSE", "R2", "Wasserstein", "KS", "KL"]
        output_dir: Path to save plots and CSV output

    Returns:
        Dict with:
        - 'sobol_first': dict mapping feature names to first-order indices
        - 'sobol_total': dict mapping feature names to total-order indices
        - 'pce_result': the fitted PCE object (for further analysis if needed)
        - 'feature_names': list of feature names
        - 'model_coeff_name': model coefficient name
        - 'varnames': list of variable names, feature_names + model_coeff_name
        - 'value_of_interest': target variable name, 'ref_power_cap' if model_coeff_name is None, else 'model_bias_cap'
    """
    # Extract config with defaults
    degree = pce_config.get("degree", 5)
    marginals = pce_config.get("marginals", "kernel")
    copula = pce_config.get("copula", "independent")
    q = pce_config.get("q", 1.0)
    DEFAULT_PLOT_OPTIONS = {
        "scatter": True,
        "distribution": False,
        "metrics": [],
    }
    user_plot_options = pce_config.get("plot_options", {}) or {}
    plot_options = {**DEFAULT_PLOT_OPTIONS, **user_plot_options}

    print("pce_config keys:", pce_config.keys())
    print("pce_config:", pce_config)
    print("plot_options", plot_options)

    model_coeff_name = pce_config.get("model_coeff_name", None)
    if isinstance(model_coeff_name, str) and model_coeff_name.strip().lower() == "none":
        model_coeff_name = None

    # Decide model vars + target output (hardcoded target names)
    if model_coeff_name is None:
        model_vars = []
        value_of_interest = "ref_power_cap"
    else:
        model_vars = [model_coeff_name]
        value_of_interest = "model_bias_cap"

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n--- PCE Sensitivity (single run) ---")
    print(f" model_coeff_name: {model_coeff_name}")
    print(f" value_of_interest: {value_of_interest}")

    # Build arrays
    input_array, output_array, varnames, kk_values, n_feature = (
        build_input_output_arrays(
            database, feature_names, model_vars, value_of_interest
        )
    )
    # Validate shapes
    n_samples, n_features = input_array.shape

    print("--- Running PCE Sensitivity Analysis ---")
    print(f" Samples: {n_samples}, Features: {n_features}")
    print(f" Degree: {degree}, Marginals: {marginals}, Copula: {copula}, q: {q}")
    print(f" Features: {feature_names}")

    # Construct PCE metamodel
    print(" Constructing PCE metamodel...")
    pce_result = construct_PCE_ot(
        input_array=input_array,
        output_array=output_array,
        marginals=[marginals] * n_features,
        copula=copula,
        degree=degree,
        q=q,
    )

    # Compute Sobol indices
    print(" Computing Sobol indices...")
    sobol_first, sobol_total = compute_sobol_indices(pce_result, n_features)

    # Create results dict
    results = {
        "sobol_first": dict(zip(varnames, sobol_first)),
        "sobol_total": dict(zip(varnames, sobol_total)),
        "pce_result": pce_result,
        "feature_names": feature_names,
        "model_coeff_name": model_coeff_name,
        "varnames": varnames,
        "value_of_interest": value_of_interest,
    }
    # Save outputs (file names tagged by model_coeff_name/target);
    # output_dir was already created above
    tag = "no_modelvar" if model_coeff_name is None else str(model_coeff_name)
    plot_file = output_dir / f"pce_sobol_indices_{tag}.png"
    csv_file = output_dir / f"pce_sobol_indices_{tag}.csv"
    # Plot Sobol indices
    plot_sobol_indices(
        sobol_first,
        sobol_total,
        varnames,
        save=True,
        filename=str(plot_file),
    )
    print(f" Saved plot to {plot_file}")

    sobol_df = pd.DataFrame(
        {"Var Names": varnames, "First_Order": sobol_first, "Total_Order": sobol_total}
    ).sort_values("Total_Order", ascending=False)
    sobol_df.to_csv(csv_file, index=False)
    print(f" Saved indices to {csv_file}")

    # Training quality plot
    PCE_metamodel = pce_result.getMetaModel()

    array_of_interest = database[value_of_interest]

    plot_training_quality(
        database=database,
        varnames=feature_names,
        value_of_interest=array_of_interest,
        kk_values=kk_values,
        input_variable_array_physical=input_array[:n_samples, :n_feature],
        n_feature=n_feature,
        nvar=n_feature + (0 if model_coeff_name is None else 1),
        PCE_metamodel=PCE_metamodel,
        save=True,
        plot_options=plot_options,
        seed=42,
        output_dir=output_dir,
    )

    return results
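

# ---------------------------------------------------------------------------
# End-to-end sketch (not executed on import): run the sensitivity analysis on
# a synthetic dataset shaped like the toy one in
# _example_build_input_output_arrays. All variable names are placeholders and
# outputs land in ./pce_demo_out/; a hedged illustration, not a reference
# workflow. Assumes xarray is installed.
# ---------------------------------------------------------------------------
def _example_run_pce_sensitivity():
    import xarray as xr  # assumed available alongside this module

    ntimes, nsamples = 40, 8
    rng = np.random.default_rng(1)
    ds = xr.Dataset(
        {
            "ws": ("wind_farm", rng.uniform(4.0, 12.0, ntimes)),
            "ti": ("wind_farm", rng.uniform(0.05, 0.15, ntimes)),
            "kk": ("sample", np.linspace(0.03, 0.07, nsamples)),
            "model_bias_cap": (
                ("sample", "wind_farm"),
                rng.normal(size=(nsamples, ntimes)),
            ),
        },
        coords={"wind_farm": np.arange(ntimes), "sample": np.arange(nsamples)},
    )
    pce_config = {
        "degree": 2,
        "marginals": "kernel",
        "copula": "independent",
        "q": 1.0,
        "model_coeff_name": "kk",
        "plot_options": {"scatter": True, "distribution": False, "metrics": []},
    }
    return run_pce_sensitivity(ds, ["ws", "ti"], pce_config, Path("./pce_demo_out"))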