Coverage for manila/share/drivers/netapp/dataontap/cluster_mode/performance.py: 98%

183 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2026-02-18 22:19 +0000

1# Copyright (c) 2016 Clinton Knight 

2# All rights reserved. 

3# 

4# Licensed under the Apache License, Version 2.0 (the "License"); you may 

5# not use this file except in compliance with the License. You may obtain 

6# a copy of the License at 

7# 

8# http://www.apache.org/licenses/LICENSE-2.0 

9# 

10# Unless required by applicable law or agreed to in writing, software 

11# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 

12# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 

13# License for the specific language governing permissions and limitations 

14# under the License. 

15""" 

16Performance metrics functions and cache for NetApp systems. 

17""" 

18 

19import copy 

20 

21from oslo_log import log as logging 

22 

23from manila import exception 

24from manila.i18n import _ 

25from manila.share.drivers.netapp.dataontap.client import api as netapp_api 

26 

27 

28LOG = logging.getLogger(__name__) 

29DEFAULT_UTILIZATION = 50 

30 

31 

class PerformanceLibrary(object):
    """Computes and caches node utilization for NetApp cDOT backends."""

    def __init__(self, zapi_client):
        """Store the API client and prime the counter-name cache.

        :param zapi_client: NetApp client used for all performance queries.
        """
        self.zapi_client = zapi_client
        # node name -> list of up to 10 recent performance counter samples
        self.performance_counters = {}
        # pool name -> most recently computed node utilization percentage
        self.pool_utilization = {}
        self._init_counter_info()

41 def _init_counter_info(self): 

42 """Set a few counter names based on Data ONTAP version.""" 

43 

44 self.system_object_name = None 

45 self.avg_processor_busy_base_counter_name = None 

46 

47 try: 

48 if self.zapi_client.features.SYSTEM_CONSTITUENT_METRICS: 

49 self.system_object_name = 'system:constituent' 

50 self.avg_processor_busy_base_counter_name = ( 

51 self._get_base_counter_name('system:constituent', 

52 'avg_processor_busy')) 

53 elif self.zapi_client.features.SYSTEM_METRICS: 

54 self.system_object_name = 'system' 

55 self.avg_processor_busy_base_counter_name = ( 

56 self._get_base_counter_name('system', 

57 'avg_processor_busy')) 

58 except netapp_api.NaApiError: 

59 if self.zapi_client.features.SYSTEM_CONSTITUENT_METRICS: 

60 self.avg_processor_busy_base_counter_name = 'cpu_elapsed_time' 

61 else: 

62 self.avg_processor_busy_base_counter_name = 'cpu_elapsed_time1' 

63 LOG.exception('Could not get performance base counter ' 

64 'name. Performance-based scheduler ' 

65 'functions may not be available.') 

66 

67 def update_performance_cache(self, flexvol_pools, aggregate_pools): 

68 """Called periodically to update per-pool node utilization metrics.""" 

69 

70 # Nothing to do on older systems 

71 if not (self.zapi_client.features.SYSTEM_METRICS or 

72 self.zapi_client.features.SYSTEM_CONSTITUENT_METRICS): 

73 return 

74 

75 # Get aggregates and nodes for all known pools 

76 aggr_names = self._get_aggregates_for_pools(flexvol_pools, 

77 aggregate_pools) 

78 node_names, aggr_node_map = self._get_nodes_for_aggregates(aggr_names) 

79 

80 # Update performance counter cache for each node 

81 node_utilization = {} 

82 for node_name in node_names: 

83 if node_name not in self.performance_counters: 

84 self.performance_counters[node_name] = [] 

85 

86 # Get new performance counters and save only the last 10 

87 counters = self._get_node_utilization_counters(node_name) 

88 if not counters: 

89 continue 

90 

91 self.performance_counters[node_name].append(counters) 

92 self.performance_counters[node_name] = ( 

93 self.performance_counters[node_name][-10:]) 

94 

95 # Update utilization for each node using newest & oldest sample 

96 counters = self.performance_counters[node_name] 

97 if len(counters) < 2: 

98 node_utilization[node_name] = DEFAULT_UTILIZATION 

99 else: 

100 node_utilization[node_name] = self._get_node_utilization( 

101 counters[0], counters[-1], node_name) 

102 

103 # Update pool utilization map atomically 

104 pool_utilization = {} 

105 all_pools = copy.deepcopy(flexvol_pools) 

106 all_pools.update(aggregate_pools) 

107 for pool_name, pool_info in all_pools.items(): 

108 aggr_name = pool_info.get('netapp_aggregate', 'unknown') 

109 node_name = aggr_node_map.get(aggr_name) 

110 if node_name: 

111 pool_utilization[pool_name] = node_utilization.get( 

112 node_name, DEFAULT_UTILIZATION) 

113 else: 

114 pool_utilization[pool_name] = DEFAULT_UTILIZATION 

115 

116 self.pool_utilization = pool_utilization 

117 

118 def get_node_utilization_for_pool(self, pool_name): 

119 """Get the node utilization for the specified pool, if available.""" 

120 

121 return self.pool_utilization.get(pool_name, DEFAULT_UTILIZATION) 

122 

123 def update_for_failover(self, zapi_client, flexvol_pools, aggregate_pools): 

124 """Change API client after a whole-backend failover event.""" 

125 

126 self.zapi_client = zapi_client 

127 self.update_performance_cache(flexvol_pools, aggregate_pools) 

128 

129 def _get_aggregates_for_pools(self, flexvol_pools, aggregate_pools): 

130 """Get the set of aggregates that contain the specified pools.""" 

131 

132 aggr_names = set() 

133 for pool_name, pool_info in aggregate_pools.items(): 

134 if pool_info.get('netapp_flexgroup', False): 

135 continue 

136 aggr_names.add(pool_info.get('netapp_aggregate')) 

137 

138 for pool_name, pool_info in flexvol_pools.items(): 

139 if pool_info.get('netapp_flexgroup', False): 139 ↛ 140line 139 didn't jump to line 140 because the condition on line 139 was never true

140 continue 

141 aggr_names.add(pool_info.get('netapp_aggregate')) 

142 

143 return list(aggr_names) 

144 

145 def _get_nodes_for_aggregates(self, aggr_names): 

146 """Get the cluster nodes that own the specified aggregates.""" 

147 

148 node_names = set() 

149 aggr_node_map = {} 

150 

151 for aggr_name in aggr_names: 

152 node_name = self.zapi_client.get_node_for_aggregate(aggr_name) 

153 if node_name: 153 ↛ 151line 153 didn't jump to line 151 because the condition on line 153 was always true

154 node_names.add(node_name) 

155 aggr_node_map[aggr_name] = node_name 

156 

157 return list(node_names), aggr_node_map 

158 

159 def _get_node_utilization(self, counters_t1, counters_t2, node_name): 

160 """Get node utilization from two sets of performance counters.""" 

161 

162 try: 

163 # Time spent in the single-threaded Kahuna domain 

164 kahuna_percent = self._get_kahuna_utilization(counters_t1, 

165 counters_t2) 

166 

167 # If Kahuna is using >60% of the CPU, the controller is fully busy 

168 if kahuna_percent > 60: 

169 return 100.0 

170 

171 # Average CPU busyness across all processors 

172 avg_cpu_percent = 100.0 * self._get_average_cpu_utilization( 

173 counters_t1, counters_t2) 

174 

175 # Total Consistency Point (CP) time 

176 total_cp_time_msec = self._get_total_consistency_point_time( 

177 counters_t1, counters_t2) 

178 

179 # Time spent in CP Phase 2 (buffer flush) 

180 p2_flush_time_msec = self._get_consistency_point_p2_flush_time( 

181 counters_t1, counters_t2) 

182 

183 # Wall-clock time between the two counter sets 

184 poll_time_msec = self._get_total_time(counters_t1, 

185 counters_t2, 

186 'total_cp_msecs') 

187 

188 # If two polls happened in quick succession, use CPU utilization 

189 if total_cp_time_msec == 0 or poll_time_msec == 0: 

190 return max(min(100.0, avg_cpu_percent), 0) 

191 

192 # Adjusted Consistency Point time 

193 adjusted_cp_time_msec = self._get_adjusted_consistency_point_time( 

194 total_cp_time_msec, p2_flush_time_msec) 

195 adjusted_cp_percent = (100.0 * 

196 adjusted_cp_time_msec / poll_time_msec) 

197 

198 # Utilization is the greater of CPU busyness & CP time 

199 node_utilization = max(avg_cpu_percent, adjusted_cp_percent) 

200 return max(min(100.0, node_utilization), 0) 

201 

202 except Exception: 

203 LOG.exception('Could not calculate node utilization for ' 

204 'node %s.', node_name) 

205 return DEFAULT_UTILIZATION 

206 

207 def _get_kahuna_utilization(self, counters_t1, counters_t2): 

208 """Get time spent in the single-threaded Kahuna domain.""" 

209 

210 # Note(cknight): Because Kahuna is single-threaded, running only on 

211 # one CPU at a time, we can safely sum the Kahuna CPU usage 

212 # percentages across all processors in a node. 

213 return sum(self._get_performance_counter_average_multi_instance( 

214 counters_t1, counters_t2, 'domain_busy:kahuna', 

215 'processor_elapsed_time')) * 100.0 

216 

217 def _get_average_cpu_utilization(self, counters_t1, counters_t2): 

218 """Get average CPU busyness across all processors.""" 

219 

220 return self._get_performance_counter_average( 

221 counters_t1, counters_t2, 'avg_processor_busy', 

222 self.avg_processor_busy_base_counter_name) 

223 

224 def _get_total_consistency_point_time(self, counters_t1, counters_t2): 

225 """Get time spent in Consistency Points in msecs.""" 

226 

227 return float(self._get_performance_counter_delta( 

228 counters_t1, counters_t2, 'total_cp_msecs')) 

229 

230 def _get_consistency_point_p2_flush_time(self, counters_t1, counters_t2): 

231 """Get time spent in CP Phase 2 (buffer flush) in msecs.""" 

232 

233 return float(self._get_performance_counter_delta( 

234 counters_t1, counters_t2, 'cp_phase_times:p2_flush')) 

235 

236 def _get_total_time(self, counters_t1, counters_t2, counter_name): 

237 """Get wall clock time between two successive counters in msecs.""" 

238 

239 timestamp_t1 = float(self._find_performance_counter_timestamp( 

240 counters_t1, counter_name)) 

241 timestamp_t2 = float(self._find_performance_counter_timestamp( 

242 counters_t2, counter_name)) 

243 return (timestamp_t2 - timestamp_t1) * 1000.0 

244 

245 def _get_adjusted_consistency_point_time(self, total_cp_time, 

246 p2_flush_time): 

247 """Get adjusted CP time by limiting CP phase 2 flush time to 20%.""" 

248 

249 return (total_cp_time - p2_flush_time) * 1.20 

250 

251 def _get_performance_counter_delta(self, counters_t1, counters_t2, 

252 counter_name): 

253 """Calculate a delta value from two performance counters.""" 

254 

255 counter_t1 = int( 

256 self._find_performance_counter_value(counters_t1, counter_name)) 

257 counter_t2 = int( 

258 self._find_performance_counter_value(counters_t2, counter_name)) 

259 

260 return counter_t2 - counter_t1 

261 

262 def _get_performance_counter_average(self, counters_t1, counters_t2, 

263 counter_name, base_counter_name, 

264 instance_name=None): 

265 """Calculate an average value from two performance counters.""" 

266 

267 counter_t1 = float(self._find_performance_counter_value( 

268 counters_t1, counter_name, instance_name)) 

269 counter_t2 = float(self._find_performance_counter_value( 

270 counters_t2, counter_name, instance_name)) 

271 base_counter_t1 = float(self._find_performance_counter_value( 

272 counters_t1, base_counter_name, instance_name)) 

273 base_counter_t2 = float(self._find_performance_counter_value( 

274 counters_t2, base_counter_name, instance_name)) 

275 

276 return (counter_t2 - counter_t1) / (base_counter_t2 - base_counter_t1) 

277 

278 def _get_performance_counter_average_multi_instance(self, counters_t1, 

279 counters_t2, 

280 counter_name, 

281 base_counter_name): 

282 """Calculate an average value from multiple counter instances.""" 

283 

284 averages = [] 

285 instance_names = [] 

286 for counter in counters_t1: 

287 if counter_name in counter: 

288 instance_names.append(counter['instance-name']) 

289 

290 for instance_name in instance_names: 

291 average = self._get_performance_counter_average( 

292 counters_t1, counters_t2, counter_name, base_counter_name, 

293 instance_name) 

294 averages.append(average) 

295 

296 return averages 

297 

298 def _find_performance_counter_value(self, counters, counter_name, 

299 instance_name=None): 

300 """Given a counter set, return the value of a named instance.""" 

301 

302 for counter in counters: 

303 if counter_name in counter: 

304 if (instance_name is None 

305 or counter['instance-name'] == instance_name): 

306 return counter[counter_name] 

307 else: 

308 raise exception.NotFound(_('Counter %s not found') % counter_name) 

309 

310 def _find_performance_counter_timestamp(self, counters, counter_name, 

311 instance_name=None): 

312 """Given a counter set, return the timestamp of a named instance.""" 

313 

314 for counter in counters: 

315 if counter_name in counter: 

316 if (instance_name is None 316 ↛ 314line 316 didn't jump to line 314 because the condition on line 316 was always true

317 or counter['instance-name'] == instance_name): 

318 return counter['timestamp'] 

319 else: 

320 raise exception.NotFound(_('Counter %s not found') % counter_name) 

321 

322 def _expand_performance_array(self, object_name, counter_name, counter): 

323 """Get array labels and expand counter data array.""" 

324 

325 # Get array labels for counter value 

326 counter_info = self.zapi_client.get_performance_counter_info( 

327 object_name, counter_name) 

328 

329 array_labels = [counter_name + ':' + label.lower() 

330 for label in counter_info['labels']] 

331 array_values = counter[counter_name].split(',') 

332 

333 # Combine labels and values, and then mix into existing counter 

334 array_data = dict(zip(array_labels, array_values)) 

335 counter.update(array_data) 

336 

337 def _get_base_counter_name(self, object_name, counter_name): 

338 """Get the name of the base counter for the specified counter.""" 

339 

340 counter_info = self.zapi_client.get_performance_counter_info( 

341 object_name, counter_name) 

342 return counter_info['base-counter'] 

343 

344 def _get_node_utilization_counters(self, node_name): 

345 """Get all performance counters for calculating node utilization.""" 

346 

347 try: 

348 return (self._get_node_utilization_system_counters(node_name) + 

349 self._get_node_utilization_wafl_counters(node_name) + 

350 self._get_node_utilization_processor_counters(node_name)) 

351 except netapp_api.NaApiError: 

352 LOG.exception('Could not get utilization counters from node ' 

353 '%s', node_name) 

354 return None 

355 

356 def _get_node_utilization_system_counters(self, node_name): 

357 """Get the system counters for calculating node utilization.""" 

358 

359 system_instance_uuids = ( 

360 self.zapi_client.get_performance_instance_uuids( 

361 self.system_object_name, node_name)) 

362 

363 system_counter_names = [ 

364 'avg_processor_busy', 

365 self.avg_processor_busy_base_counter_name, 

366 ] 

367 if 'cpu_elapsed_time1' in system_counter_names: 367 ↛ 370line 367 didn't jump to line 370 because the condition on line 367 was always true

368 system_counter_names.append('cpu_elapsed_time') 

369 

370 system_counters = self.zapi_client.get_performance_counters( 

371 self.system_object_name, system_instance_uuids, 

372 system_counter_names) 

373 

374 return system_counters 

375 

376 def _get_node_utilization_wafl_counters(self, node_name): 

377 """Get the WAFL counters for calculating node utilization.""" 

378 

379 wafl_instance_uuids = self.zapi_client.get_performance_instance_uuids( 

380 'wafl', node_name) 

381 

382 wafl_counter_names = ['total_cp_msecs', 'cp_phase_times'] 

383 wafl_counters = self.zapi_client.get_performance_counters( 

384 'wafl', wafl_instance_uuids, wafl_counter_names) 

385 

386 # Expand array data so we can use wafl:cp_phase_times[P2_FLUSH] 

387 for counter in wafl_counters: 

388 if 'cp_phase_times' in counter: 

389 self._expand_performance_array( 

390 'wafl', 'cp_phase_times', counter) 

391 

392 return wafl_counters 

393 

394 def _get_node_utilization_processor_counters(self, node_name): 

395 """Get the processor counters for calculating node utilization.""" 

396 

397 processor_instance_uuids = ( 

398 self.zapi_client.get_performance_instance_uuids('processor', 

399 node_name)) 

400 

401 processor_counter_names = ['domain_busy', 'processor_elapsed_time'] 

402 processor_counters = self.zapi_client.get_performance_counters( 

403 'processor', processor_instance_uuids, processor_counter_names) 

404 

405 # Expand array data so we can use processor:domain_busy[kahuna] 

406 for counter in processor_counters: 

407 if 'domain_busy' in counter: 

408 self._expand_performance_array( 

409 'processor', 'domain_busy', counter) 

410 

411 return processor_counters