-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrep_utility.py
More file actions
393 lines (323 loc) · 11.5 KB
/
rep_utility.py
File metadata and controls
393 lines (323 loc) · 11.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
"""
module: output_analysis
Provides methods for the confidence interval method for selecting
the number of replications to run with a Discrete-Event Simulation.
"""
import plotly.graph_objects as go
import numpy as np
import pandas as pd
from scipy.stats import t
import warnings
from typing import Protocol, runtime_checkable, Optional
OBSERVER_INTERFACE_ERROR = "Observers of OnlineStatistics must implement " \
+ "ReplicationObserver interface. i.e. " \
+ "update(results: OnlineStatistics) -> None"
@runtime_checkable
class ReplicationObserver(Protocol):
"""
Interface for an observer of an instance of the ReplicationsAnalyser
"""
def update(self, results) -> None:
"""
Add an observation of a replication
Parameters:
-----------
results: OnlineStatistic
The current replication to observe.
"""
pass
class OnlineStatistics:
"""
Welford’s algorithm for computing a running sample mean and
variance. Allowing computation of CIs and half width % deviation
from the mean.
This is a robust, accurate and old(ish) approach (1960s) that
I first read about in Donald Knuth’s art of computer programming vol 2.
"""
def __init__(
self,
data: Optional[np.ndarray] = None,
alpha: Optional[float] = 0.1,
observer: Optional[ReplicationObserver] = None
) -> None:
"""
Initiaise Welford’s algorithm for computing a running sample mean and
variance.
Parameters:
-------
data: array-like, optional (default = None)
Contains an initial data sample.
alpha: float
To compute 100(1 - alpha) confidence interval
observer: ReplicationObserver, optional (default=None)
A user may optionally track the updates to the statistics using a
ReplicationObserver (e.g. ReplicationTabuliser). This allows further
tabular or visual analysis or saving results to file if required.
"""
self.n = 0
self.x_i = None
self.mean = None
# sum of squares of differences from the current mean
self._sq = None
self.alpha = alpha
self._observers = []
if observer is not None:
self.register_observer(observer)
if isinstance(data, np.ndarray):
for x in data:
self.update(x)
def register_observer(self, observer: ReplicationObserver) -> None:
"""
observer: ReplicationRecorder, optional (default = None)
Include a method for recording the replication results at each
update. Part of observer pattern. If None then no results are
observed.
"""
if not isinstance(observer, ReplicationObserver):
raise ValueError(OBSERVER_INTERFACE_ERROR)
self._observers.append(observer)
@property
def variance(self) -> float:
"""
Sample variance of data
Sum of squares of differences from the current mean divided by n - 1
"""
return self._sq / (self.n - 1)
@property
def std(self) -> float:
"""
Standard deviation of data
"""
if self.n > 2:
return np.sqrt(self.variance)
else:
return np.nan
@property
def std_error(self) -> float:
"""
Standard error of the mean
"""
return self.std / np.sqrt(self.n)
@property
def half_width(self) -> float:
"""
Confidence interval half width
"""
dof = self.n - 1
t_value = t.ppf(1 - (self.alpha / 2), dof)
return t_value * self.std_error
@property
def lci(self) -> float:
"""
Lower confidence interval bound
"""
if self.n > 2:
return self.mean - self.half_width
else:
return np.nan
@property
def uci(self) -> float:
"""
Lower confidence interval bound
"""
if self.n > 2:
return self.mean + self.half_width
else:
return np.nan
@property
def deviation(self) -> float:
"""
Precision of the confidence interval expressed as the
percentage deviation of the half width from the mean.
"""
if self.n > 2:
return self.half_width / self.mean
else:
return np.nan
def update(self, x: float) -> None:
"""
Running update of mean and variance implemented using Welford's
algorithm (1962).
See Knuth. D `The Art of Computer Programming` Vol 2. 2nd ed. Page 216.
Params:
------
x: float
A new observation
"""
self.n += 1
self.x_i = x
# init values
if self.n == 1:
self.mean = x
self._sq = 0
else:
# compute the updated mean
updated_mean = self.mean + ((x - self.mean) / self.n)
# update the sum of squares of differences from the current mean
self._sq += (x - self.mean) * (x - updated_mean)
# update the tracked mean
self.mean = updated_mean
self.notify()
def notify(self) -> None:
"""
Notify any observers that a update has taken place.
"""
for observer in self._observers:
observer.update(self)
class ReplicationTabulizer:
"""
Record the replication results from an instance of ReplicationsAlgorithm
in a pandas DataFrame.
Implement as the part of observer pattern. Provides a summary frame
equivalent to the output of a confidence_interval_method
"""
def __init__(self):
# to track online stats
self.stdev = []
self.lower = []
self.upper = []
self.dev = []
self.cumulative_mean = []
self.x_i = []
self.n = 0
def update(self, results: OnlineStatistics) -> None:
"""
Add an observation of a replication
Parameters:
-----------
results: OnlineStatistic
The current replication to observe.
"""
self.x_i.append(results.x_i)
self.cumulative_mean.append(results.mean)
self.stdev.append(results.std)
self.lower.append(results.lci)
self.upper.append(results.uci)
self.dev.append(results.deviation)
self.n += 1
def summary_table(self) -> pd.DataFrame:
"""
Return a dataframe of results equivalent to the confidence interval
method.
"""
# combine results into a single dataframe
results = pd.DataFrame([self.x_i, self.cumulative_mean,
self.stdev, self.lower, self.upper, self.dev]).T
results.columns = ['Mean', 'Cumulative Mean', 'Standard Deviation',
'Lower Interval', 'Upper Interval', '% deviation']
results.index = np.arange(1, self.n+1)
results.index.name = 'replications'
return results
def confidence_interval_method(
replications,
alpha: Optional[float] = 0.05,
desired_precision: Optional[float] = 0.05,
min_rep: Optional[int] = 5,
decimal_places: Optional[int] = 2
):
'''
The confidence interval method for selecting the number of replications
to run in a simulation.
Finds the smallest number of replications where the width of the confidence
interval is less than the desired_precision.
Returns both the number of replications and the full results dataframe.
Parameters:
----------
replications: arraylike
Array (e.g. np.ndarray or list) of replications of a performance metric
alpha: float, optional (default=0.05)
procedure constructs a 100(1-alpha) confidence interval for the
cumulative mean.
desired_precision: float, optional (default=0.05)
Desired mean deviation from confidence interval.
min_rep: int, optional (default=5)
set to a integer > 0 and ignore all of the replications prior to it
when selecting the number of replications to run to achieve the desired
precision. Useful when the number of replications returned does not
provide a stable precision below target.
decimal_places: int, optional (default=2)
sets the number of decimal places of the returned dataframe containing
the results
Returns:
--------
tuple: int, pd.DataFrame
'''
# welford's method to track cumulative mean and construct CIs at each rep
# track the process and construct data table using ReplicationTabuliser
observer = ReplicationTabulizer()
stats = OnlineStatistics(alpha=alpha, data=replications[:2], observer=observer)
# iteratively update.
for i in range(2, len(replications)):
stats.update(replications[i])
results = observer.summary_table()
# get the smallest no. of reps where deviation is less than precision target
try:
n_reps = results.iloc[min_rep:].loc[results['% deviation']
<= desired_precision].iloc[0].name
except IndexError:
# no replications with desired precision
message = 'WARNING: the replications do not reach desired precision'
warnings.warn(message)
n_reps = -1
return n_reps, results.round(decimal_places)
def plotly_confidence_interval_method(n_reps, conf_ints, metric_name,
figsize=(1200, 400)):
"""
Interactive Plotly visualization with deviation hover information
Parameters:
----------
n_reps: int
Minimum number of reps selected
conf_ints: pandas.DataFrame
Results from `confidence_interval_method` function
metric_name: str
Name of the performance measure
figsize: tuple, optional (default=(1200,400))
Plot dimensions in pixels (width, height)
Returns:
-------
plotly.graph_objects.Figure
"""
fig = go.Figure()
# Calculate relative deviations [1][4]
deviation_pct = ((conf_ints['Upper Interval'] - conf_ints['Cumulative Mean']) /
conf_ints['Cumulative Mean'] * 100).round(2)
# Confidence interval bands with hover info
for col, color, dash in zip(['Lower Interval', 'Upper Interval'],
['lightblue', 'lightblue'],
['dot', 'dot']):
fig.add_trace(go.Scatter(
x=conf_ints.index,
y=conf_ints[col],
line=dict(color=color, dash=dash),
name=col,
text=[f'Deviation: {d}%' for d in deviation_pct],
hoverinfo='x+y+name+text'
))
# Cumulative mean line with enhanced hover
fig.add_trace(go.Scatter(
x=conf_ints.index,
y=conf_ints['Cumulative Mean'],
line=dict(color='blue', width=2),
name='Cumulative Mean',
hoverinfo='x+y+name'
))
# Vertical threshold line
fig.add_shape(
type='line',
x0=n_reps,
x1=n_reps,
y0=0,
y1=1,
yref='paper',
line=dict(color='red', dash='dash')
)
# Configure layout
fig.update_layout(
width=figsize[0],
height=figsize[1],
yaxis_title=f'Cumulative Mean: {metric_name}',
hovermode='x unified',
showlegend=True
)
return fig