Coverage for manila/share/drivers/netapp/dataontap/cluster_mode/performance.py: 98%
183 statements
« prev ^ index » next coverage.py v7.11.0, created at 2026-02-18 22:19 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2026-02-18 22:19 +0000
1# Copyright (c) 2016 Clinton Knight
2# All rights reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License"); you may
5# not use this file except in compliance with the License. You may obtain
6# a copy of the License at
7#
8# http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13# License for the specific language governing permissions and limitations
14# under the License.
15"""
16Performance metrics functions and cache for NetApp systems.
17"""
19import copy
21from oslo_log import log as logging
23from manila import exception
24from manila.i18n import _
25from manila.share.drivers.netapp.dataontap.client import api as netapp_api
28LOG = logging.getLogger(__name__)
29DEFAULT_UTILIZATION = 50
class PerformanceLibrary(object):
    """Collects and caches node utilization metrics for NetApp systems.

    Performance counters are sampled periodically for each cluster node
    and distilled into a 0-100 utilization figure per storage pool, which
    performance-based scheduler functions can consume.
    """

    def __init__(self, zapi_client):

        # Client used for all performance-related backend API calls.
        self.zapi_client = zapi_client
        # Maps node name to its most recent counter samples (last 10 kept).
        self.performance_counters = {}
        # Maps pool name to its most recently computed node utilization.
        self.pool_utilization = {}
        self._init_counter_info()

    def _init_counter_info(self):
        """Set a few counter names based on Data ONTAP version."""

        self.system_object_name = None
        self.avg_processor_busy_base_counter_name = None

        features = self.zapi_client.features
        try:
            if features.SYSTEM_CONSTITUENT_METRICS:
                self.system_object_name = 'system:constituent'
                self.avg_processor_busy_base_counter_name = (
                    self._get_base_counter_name('system:constituent',
                                                'avg_processor_busy'))
            elif features.SYSTEM_METRICS:
                self.system_object_name = 'system'
                self.avg_processor_busy_base_counter_name = (
                    self._get_base_counter_name('system',
                                                'avg_processor_busy'))
        except netapp_api.NaApiError:
            # Fall back to the base counter name known for each platform.
            if features.SYSTEM_CONSTITUENT_METRICS:
                self.avg_processor_busy_base_counter_name = 'cpu_elapsed_time'
            else:
                self.avg_processor_busy_base_counter_name = 'cpu_elapsed_time1'
            LOG.exception('Could not get performance base counter '
                          'name. Performance-based scheduler '
                          'functions may not be available.')

    def update_performance_cache(self, flexvol_pools, aggregate_pools):
        """Called periodically to update per-pool node utilization metrics."""

        features = self.zapi_client.features
        if not (features.SYSTEM_METRICS or
                features.SYSTEM_CONSTITUENT_METRICS):
            # Older systems expose no usable performance metrics.
            return

        # Resolve every known pool down to its aggregate and owning node.
        aggr_names = self._get_aggregates_for_pools(flexvol_pools,
                                                    aggregate_pools)
        node_names, aggr_node_map = self._get_nodes_for_aggregates(aggr_names)

        # Refresh counter history and utilization for each node.
        node_utilization = {}
        for node in node_names:
            history = self.performance_counters.setdefault(node, [])

            sample = self._get_node_utilization_counters(node)
            if not sample:
                continue

            # Retain only the 10 most recent samples per node.
            history.append(sample)
            del history[:-10]

            # Utilization is derived from the oldest and newest samples;
            # with fewer than two, report the default value.
            if len(history) < 2:
                node_utilization[node] = DEFAULT_UTILIZATION
            else:
                node_utilization[node] = self._get_node_utilization(
                    history[0], history[-1], node)

        # Build a fresh pool map and swap it in atomically.
        combined_pools = copy.deepcopy(flexvol_pools)
        combined_pools.update(aggregate_pools)
        fresh_pool_utilization = {}
        for pool, pool_info in combined_pools.items():
            node = aggr_node_map.get(
                pool_info.get('netapp_aggregate', 'unknown'))
            if node:
                fresh_pool_utilization[pool] = node_utilization.get(
                    node, DEFAULT_UTILIZATION)
            else:
                fresh_pool_utilization[pool] = DEFAULT_UTILIZATION

        self.pool_utilization = fresh_pool_utilization

    def get_node_utilization_for_pool(self, pool_name):
        """Get the node utilization for the specified pool, if available."""

        return self.pool_utilization.get(pool_name, DEFAULT_UTILIZATION)

    def update_for_failover(self, zapi_client, flexvol_pools, aggregate_pools):
        """Change API client after a whole-backend failover event."""

        self.zapi_client = zapi_client
        self.update_performance_cache(flexvol_pools, aggregate_pools)

    def _get_aggregates_for_pools(self, flexvol_pools, aggregate_pools):
        """Get the set of aggregates that contain the specified pools."""

        # FlexGroup pools span multiple aggregates and are excluded here.
        aggr_names = {
            pool_info.get('netapp_aggregate')
            for pool_map in (aggregate_pools, flexvol_pools)
            for pool_info in pool_map.values()
            if not pool_info.get('netapp_flexgroup', False)
        }
        return list(aggr_names)

    def _get_nodes_for_aggregates(self, aggr_names):
        """Get the cluster nodes that own the specified aggregates."""

        aggr_node_map = {}
        for aggr in aggr_names:
            node = self.zapi_client.get_node_for_aggregate(aggr)
            if node:
                aggr_node_map[aggr] = node

        return list(set(aggr_node_map.values())), aggr_node_map

    def _get_node_utilization(self, counters_t1, counters_t2, node_name):
        """Get node utilization from two sets of performance counters."""

        try:
            # Kahuna is single-threaded; above 60% CPU in that domain the
            # controller is effectively saturated.
            if self._get_kahuna_utilization(counters_t1, counters_t2) > 60:
                return 100.0

            # Average CPU busyness across all processors.
            cpu_busy_percent = 100.0 * self._get_average_cpu_utilization(
                counters_t1, counters_t2)

            # Total Consistency Point (CP) time and CP Phase 2 flush time.
            cp_time_msec = self._get_total_consistency_point_time(
                counters_t1, counters_t2)
            p2_flush_msec = self._get_consistency_point_p2_flush_time(
                counters_t1, counters_t2)

            # Wall-clock time between the two counter sets.
            elapsed_msec = self._get_total_time(counters_t1,
                                                counters_t2,
                                                'total_cp_msecs')

            # With no CP activity, or two polls in quick succession, CPU
            # busyness is the only meaningful signal.
            if cp_time_msec == 0 or elapsed_msec == 0:
                return max(min(100.0, cpu_busy_percent), 0)

            # Express adjusted CP time as a percentage of the poll window.
            adjusted_cp_msec = self._get_adjusted_consistency_point_time(
                cp_time_msec, p2_flush_msec)
            cp_percent = 100.0 * adjusted_cp_msec / elapsed_msec

            # Utilization is the greater of CPU busyness & CP time,
            # clamped to [0, 100].
            return max(min(100.0, max(cpu_busy_percent, cp_percent)), 0)

        except Exception:
            LOG.exception('Could not calculate node utilization for '
                          'node %s.', node_name)
            return DEFAULT_UTILIZATION

    def _get_kahuna_utilization(self, counters_t1, counters_t2):
        """Get time spent in the single-threaded Kahuna domain."""

        # Note(cknight): Because Kahuna is single-threaded, running only on
        # one CPU at a time, we can safely sum the Kahuna CPU usage
        # percentages across all processors in a node.
        per_processor = self._get_performance_counter_average_multi_instance(
            counters_t1, counters_t2, 'domain_busy:kahuna',
            'processor_elapsed_time')
        return sum(per_processor) * 100.0

    def _get_average_cpu_utilization(self, counters_t1, counters_t2):
        """Get average CPU busyness across all processors."""

        return self._get_performance_counter_average(
            counters_t1, counters_t2, 'avg_processor_busy',
            self.avg_processor_busy_base_counter_name)

    def _get_total_consistency_point_time(self, counters_t1, counters_t2):
        """Get time spent in Consistency Points in msecs."""

        delta = self._get_performance_counter_delta(
            counters_t1, counters_t2, 'total_cp_msecs')
        return float(delta)

    def _get_consistency_point_p2_flush_time(self, counters_t1, counters_t2):
        """Get time spent in CP Phase 2 (buffer flush) in msecs."""

        delta = self._get_performance_counter_delta(
            counters_t1, counters_t2, 'cp_phase_times:p2_flush')
        return float(delta)

    def _get_total_time(self, counters_t1, counters_t2, counter_name):
        """Get wall clock time between two successive counters in msecs."""

        start = float(self._find_performance_counter_timestamp(
            counters_t1, counter_name))
        end = float(self._find_performance_counter_timestamp(
            counters_t2, counter_name))
        # Timestamps are in seconds; convert the difference to msecs.
        return (end - start) * 1000.0

    def _get_adjusted_consistency_point_time(self, total_cp_time,
                                             p2_flush_time):
        """Get adjusted CP time by limiting CP phase 2 flush time to 20%."""

        return (total_cp_time - p2_flush_time) * 1.20

    def _get_performance_counter_delta(self, counters_t1, counters_t2,
                                       counter_name):
        """Calculate a delta value from two performance counters."""

        start = int(
            self._find_performance_counter_value(counters_t1, counter_name))
        end = int(
            self._find_performance_counter_value(counters_t2, counter_name))
        return end - start

    def _get_performance_counter_average(self, counters_t1, counters_t2,
                                         counter_name, base_counter_name,
                                         instance_name=None):
        """Calculate an average value from two performance counters."""

        find = self._find_performance_counter_value
        delta = (float(find(counters_t2, counter_name, instance_name)) -
                 float(find(counters_t1, counter_name, instance_name)))
        base_delta = (
            float(find(counters_t2, base_counter_name, instance_name)) -
            float(find(counters_t1, base_counter_name, instance_name)))
        return delta / base_delta

    def _get_performance_counter_average_multi_instance(self, counters_t1,
                                                        counters_t2,
                                                        counter_name,
                                                        base_counter_name):
        """Calculate an average value from multiple counter instances."""

        instance_names = [record['instance-name'] for record in counters_t1
                          if counter_name in record]
        return [
            self._get_performance_counter_average(
                counters_t1, counters_t2, counter_name, base_counter_name,
                instance_name)
            for instance_name in instance_names
        ]

    def _find_performance_counter_value(self, counters, counter_name,
                                        instance_name=None):
        """Given a counter set, return the value of a named instance."""

        for record in counters:
            if counter_name not in record:
                continue
            if (instance_name is None or
                    record['instance-name'] == instance_name):
                return record[counter_name]
        raise exception.NotFound(_('Counter %s not found') % counter_name)

    def _find_performance_counter_timestamp(self, counters, counter_name,
                                            instance_name=None):
        """Given a counter set, return the timestamp of a named instance."""

        for record in counters:
            if counter_name not in record:
                continue
            if (instance_name is None or
                    record['instance-name'] == instance_name):
                return record['timestamp']
        raise exception.NotFound(_('Counter %s not found') % counter_name)

    def _expand_performance_array(self, object_name, counter_name, counter):
        """Get array labels and expand counter data array."""

        # Ask the backend which labels apply to this array counter.
        counter_info = self.zapi_client.get_performance_counter_info(
            object_name, counter_name)

        labels = [counter_name + ':' + label.lower()
                  for label in counter_info['labels']]
        values = counter[counter_name].split(',')

        # Mix the labeled values into the existing counter record.
        counter.update(zip(labels, values))

    def _get_base_counter_name(self, object_name, counter_name):
        """Get the name of the base counter for the specified counter."""

        counter_info = self.zapi_client.get_performance_counter_info(
            object_name, counter_name)
        return counter_info['base-counter']

    def _get_node_utilization_counters(self, node_name):
        """Get all performance counters for calculating node utilization."""

        collectors = (self._get_node_utilization_system_counters,
                      self._get_node_utilization_wafl_counters,
                      self._get_node_utilization_processor_counters)
        try:
            combined = []
            for collect in collectors:
                combined.extend(collect(node_name))
            return combined
        except netapp_api.NaApiError:
            LOG.exception('Could not get utilization counters from node '
                          '%s', node_name)
            return None

    def _get_node_utilization_system_counters(self, node_name):
        """Get the system counters for calculating node utilization."""

        uuids = self.zapi_client.get_performance_instance_uuids(
            self.system_object_name, node_name)

        counter_names = [
            'avg_processor_busy',
            self.avg_processor_busy_base_counter_name,
        ]
        if 'cpu_elapsed_time1' in counter_names:
            # Older systems also need the plain elapsed-time counter.
            counter_names.append('cpu_elapsed_time')

        return self.zapi_client.get_performance_counters(
            self.system_object_name, uuids, counter_names)

    def _get_node_utilization_wafl_counters(self, node_name):
        """Get the WAFL counters for calculating node utilization."""

        uuids = self.zapi_client.get_performance_instance_uuids(
            'wafl', node_name)

        wafl_counters = self.zapi_client.get_performance_counters(
            'wafl', uuids, ['total_cp_msecs', 'cp_phase_times'])

        # Expand array data so we can use wafl:cp_phase_times[P2_FLUSH]
        for record in wafl_counters:
            if 'cp_phase_times' in record:
                self._expand_performance_array(
                    'wafl', 'cp_phase_times', record)

        return wafl_counters

    def _get_node_utilization_processor_counters(self, node_name):
        """Get the processor counters for calculating node utilization."""

        uuids = self.zapi_client.get_performance_instance_uuids(
            'processor', node_name)

        processor_counters = self.zapi_client.get_performance_counters(
            'processor', uuids, ['domain_busy', 'processor_elapsed_time'])

        # Expand array data so we can use processor:domain_busy[kahuna]
        for record in processor_counters:
            if 'domain_busy' in record:
                self._expand_performance_array(
                    'processor', 'domain_busy', record)

        return processor_counters