Usage Guide
This guide provides practical examples of how to use hydroutils for common hydrological analysis tasks.
Getting Started
| import hydroutils as hu
import numpy as np
import pandas as pd
|
1. Statistical Analysis
Basic Hydrological Statistics
Calculate common hydrological performance metrics:
1
2
3
4
5
6
7
8
9
10
11
12 | # Sample data
observed = np.array([10.5, 12.3, 8.7, 15.2, 11.8, 9.4, 13.6])
simulated = np.array([10.1, 12.8, 8.9, 14.7, 11.2, 9.8, 13.1])

# Calculate comprehensive statistics: one call returns every metric, keyed by name
stats = hu.stat_error(observed, simulated)

# Report label -> key in stats; each entry is read at index 0
metric_labels = {
    "Nash-Sutcliffe Efficiency (NSE)": "NSE",
    "Root Mean Square Error (RMSE)": "RMSE",
    "Bias": "Bias",
    "Correlation": "Corr",
    "Kling-Gupta Efficiency (KGE)": "KGE",
}
for label, key in metric_labels.items():
    print(f"{label}: {stats[key][0]:.3f}")
|
Kling-Gupta Efficiency
Calculate KGE individually:
| kge_value = hu.KGE(simulated, observed)
# NOTE(review): KGE is called here with (simulated, observed), the reverse of
# the (observed, simulated) order passed to hu.stat_error elsewhere in this
# guide -- confirm the expected argument order against the hydroutils API.
print(f"KGE: {kge_value:.3f}")
|
Flow Duration Curve Analysis
| # Calculate flow duration curve slope
# Bounds handed to fms as its `lower` / `upper` keyword arguments
fdc_lower, fdc_upper = 0.2, 0.7
fms_value = hu.fms(observed, simulated, lower=fdc_lower, upper=fdc_upper)
print(f"Flow Duration Curve Middle Slope: {fms_value:.3f}")
|
2. Time Series Processing
Unit Conversions
Convert between different streamflow units:
| # Convert cubic meters per second to cubic feet per second
flow_cms = np.array([10.5, 12.3, 8.7, 15.2])
conversion = {"from_unit": "cms", "to_unit": "cfs"}
flow_cfs = hu.streamflow_unit_conv(flow_cms, **conversion)
print(f"Flow in CFS: {flow_cfs}")

# Detect time interval: build a 100-step daily index and pass it to hydroutils
time_series = pd.date_range(start="2020-01-01", periods=100, freq="D")
interval = hu.detect_time_interval(time_series)
print(f"Detected interval: {interval}")
|
Time Interval Validation
| # Validate unit compatibility
# Run both lookups first, then report the two results together
is_compatible = hu.validate_unit_compatibility("cms", "streamflow")
interval_info = hu.get_time_interval_info("1D")

print(f"CMS compatible with streamflow: {is_compatible}")
print(f"Daily interval info: {interval_info}")
|
3. Data Processing with Files
Reading and Processing Data
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24 | # Example of processing a CSV file with hydrological data
data = pd.read_csv('streamflow_data.csv', parse_dates=['date'])

# Calculate statistics for multiple stations
stations = ['station_001', 'station_002', 'station_003']
results = {}
for station in stations:
    obs_col = f'{station}_obs'
    sim_col = f'{station}_sim'
    # Skip stations that do not have both an observed and a simulated column
    if obs_col not in data.columns or sim_col not in data.columns:
        continue

    # Align data: drop a row when EITHER value is missing, so every remaining
    # observed value stays paired with the simulated value from the same row.
    # Dropping NaNs from each column separately and truncating to a common
    # length would pair values from different time steps.
    paired = data[[obs_col, sim_col]].dropna()
    if paired.empty:
        # Nothing to compare for this station
        continue

    results[station] = hu.stat_error(paired[obs_col].values, paired[sim_col].values)

# Display results
for station, stats in results.items():
    print(f"\n{station}:")
    print(f" NSE: {stats['NSE'][0]:.3f}")
    print(f" RMSE: {stats['RMSE'][0]:.3f}")
|
4. Advanced Statistical Analysis
1
2
3
4
5
6
7
8
9
10
11
12
13
14 | # Calculate statistical properties
# Log-normal distributed flow
flow_data = np.random.lognormal(mean=2, sigma=1, size=1000)

# Compute the three summaries up front: basic, gamma-transformed, four-index
basic_stats = hu.cal_stat(flow_data)
gamma_stats = hu.cal_stat_gamma(flow_data)
four_stats = hu.cal_4_stat_inds(flow_data)

# Report them in the same order they were computed
print(f"Basic statistics: {basic_stats}")
print(f"Gamma-transformed statistics: {gamma_stats}")
print(f"P10, P90, Mean, Std: {four_stats}")
|
Empirical Cumulative Distribution Function
1
2
3
4
5
6
7
8
9
10
11
12 | # Calculate ECDF
sorted_data, probabilities = hu.ecdf(flow_data)

# Plot ECDF (requires matplotlib)
import matplotlib.pyplot as plt

# Single axes on an 8x6 figure, configured through the Axes object
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(sorted_data, probabilities)
ax.set_xlabel('Flow')
ax.set_ylabel('Probability')
ax.set_title('Empirical Cumulative Distribution Function')
ax.grid(True)
plt.show()
|
5. Working with Multiple Time Series
Batch Processing
| # Process multiple time series
n_stations, n_steps = 5, 100  # 5 stations, 100 time steps
observed_series = np.random.rand(n_stations, n_steps)
noise = np.random.normal(0, 0.1, (n_stations, n_steps))
simulated_series = observed_series + noise

# Calculate statistics for all series
all_stats = hu.stat_errors(observed_series, simulated_series)

# Extract NSE values for all stations (index 0 of each entry's 'NSE' item)
nse_values = [series_stats['NSE'][0] for series_stats in all_stats]
print(f"NSE values for all stations: {nse_values}")
|
6. Practical Example: Complete Workflow
Here's a complete example of a typical hydrological analysis workflow:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69 | import hydroutils as hu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 1. Load data
def load_sample_data(seed=None):
    """Generate sample hydrological data.

    Builds a daily series from 2020-01-01 to 2022-12-31 consisting of a
    sinusoidal seasonal signal plus Gaussian noise ("observed"), and a scaled,
    noisier copy of it ("simulated").

    Args:
        seed: Optional seed for reproducible output. When omitted, NumPy's
            global random state is used, so repeated calls return different
            values.

    Returns:
        pandas.DataFrame with columns 'date', 'observed' and 'simulated',
        one row per day.
    """
    # Without a seed, keep drawing from the global NumPy random state; with a
    # seed, use an independent generator so the global state is left untouched.
    rng = np.random if seed is None else np.random.default_rng(seed)

    dates = pd.date_range('2020-01-01', '2022-12-31', freq='D')
    n_days = len(dates)

    # Simulate observed streamflow with seasonal pattern (period of 365.25 days)
    base_flow = 10 + 5 * np.sin(2 * np.pi * np.arange(n_days) / 365.25)
    observed = base_flow + rng.normal(0, 2, n_days)

    # Simulate model predictions with some bias and error
    simulated = observed * 0.95 + rng.normal(0, 1.5, n_days)

    return pd.DataFrame({
        'date': dates,
        'observed': observed,
        'simulated': simulated,
    })
# 2. Load and prepare data
df = load_sample_data()
print(f"Data shape: {df.shape}")
first_day, last_day = df['date'].min(), df['date'].max()
print(f"Date range: {first_day} to {last_day}")

# 3. Calculate comprehensive statistics
obs_values = df['observed'].values
sim_values = df['simulated'].values
stats = hu.stat_error(obs_values, sim_values)
print("\nPerformance Metrics:")
# (printed label, key in stats), in report order
for label, key in (
    ("NSE", "NSE"),
    ("KGE", "KGE"),
    ("RMSE", "RMSE"),
    ("Bias", "Bias"),
    ("Correlation", "Corr"),
):
    print(f"{label}: {stats[key][0]:.3f}")

# 4. Additional analysis
kge_individual = hu.KGE(sim_values, obs_values)
print(f"KGE (individual calculation): {kge_individual:.3f}")

# 5. Unit conversion example
flow_cfs = hu.streamflow_unit_conv(obs_values, 'cms', 'cfs')
print(f"Mean flow: {df['observed'].mean():.1f} cms = {flow_cfs.mean():.1f} cfs")

# 6. Visualization (optional): time series on top, scatter plot below
fig, (ax_series, ax_scatter) = plt.subplots(2, 1, figsize=(12, 8))

ax_series.plot(df['date'], df['observed'], label='Observed', alpha=0.7)
ax_series.plot(df['date'], df['simulated'], label='Simulated', alpha=0.7)
ax_series.set_ylabel('Streamflow (cms)')
ax_series.set_title('Time Series Comparison')
ax_series.legend()
ax_series.grid(True)

ax_scatter.scatter(df['observed'], df['simulated'], alpha=0.5)
# Dashed 1:1 reference line spanning the observed range
obs_range = [df['observed'].min(), df['observed'].max()]
ax_scatter.plot(obs_range, obs_range, 'r--')
ax_scatter.set_xlabel('Observed (cms)')
ax_scatter.set_ylabel('Simulated (cms)')
ax_scatter.set_title(f'Scatter Plot (NSE: {stats["NSE"][0]:.3f})')
ax_scatter.grid(True)

fig.tight_layout()
plt.show()
print("\nAnalysis complete!")
|
7. Error Handling and Best Practices
Handling Missing Data
| # Sample data with NaN values
obs_with_nan = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
sim_with_nan = np.array([1.1, 2.2, 3.1, 4.2, np.nan])

# The stat_error function automatically handles NaN values
try:
    # Keep only the call that can fail inside the try body
    stats = hu.stat_error(obs_with_nan, sim_with_nan)
except Exception as e:
    # Broad catch is deliberate here: the example just reports whatever went wrong
    print(f"Error: {e}")
else:
    print("Statistics calculated successfully with NaN handling")
|
Data Validation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29 | # Validate input data before analysis
def validate_data(observed, simulated):
    """Validate input data for hydrological analysis.

    Args:
        observed: Array-like of observed values. Missing entries may be NaN
            (``None`` entries are converted to NaN).
        simulated: Array-like of simulated values, same length as ``observed``.

    Returns:
        Boolean NumPy array that is True where both ``observed`` and
        ``simulated`` hold a non-NaN value.

    Raises:
        ValueError: If the two inputs differ in length, or if they are empty.
    """
    # Coerce to float arrays so plain lists and None placeholders work with np.isnan
    observed = np.asarray(observed, dtype=float)
    simulated = np.asarray(simulated, dtype=float)

    if len(observed) != len(simulated):
        raise ValueError("Observed and simulated data must have same length")
    if len(observed) == 0:
        raise ValueError("Data arrays cannot be empty")

    # A position is usable only when neither series is missing there
    valid_both = ~np.isnan(observed) & ~np.isnan(simulated)

    # Few usable pairs is reported but does not raise
    if np.sum(valid_both) < 10:
        print("Warning: Less than 10 valid data points")
    return valid_both
# Example usage: a random series and a noisy copy of it
n_points = 100
obs = np.random.rand(n_points)
sim = obs + np.random.normal(0, 0.1, n_points)

# Add some NaN values in non-overlapping stretches of each series
obs[slice(5, 10)] = np.nan
sim[slice(15, 20)] = np.nan

valid_mask = validate_data(obs, sim)
n_valid = np.sum(valid_mask)
print(f"Valid data points: {n_valid}/{len(obs)}")
|
Next Steps