blob: eaf8de7b144067ceae30dcb1540685c12828923d [file] [log] [blame]
Neels Hofmeyrdae3d3c2017-03-28 12:16:58 +02001# osmo_gsm_tester: process management
2#
3# Copyright (C) 2016-2017 by sysmocom - s.f.m.c. GmbH
4#
5# Author: Neels Hofmeyr <neels@hofmeyr.de>
6#
7# This program is free software: you can redistribute it and/or modify
Harald Welte27205342017-06-03 09:51:45 +02008# it under the terms of the GNU General Public License as
Neels Hofmeyrdae3d3c2017-03-28 12:16:58 +02009# published by the Free Software Foundation, either version 3 of the
10# License, or (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Harald Welte27205342017-06-03 09:51:45 +020015# GNU General Public License for more details.
Neels Hofmeyrdae3d3c2017-03-28 12:16:58 +020016#
Harald Welte27205342017-06-03 09:51:45 +020017# You should have received a copy of the GNU General Public License
Neels Hofmeyrdae3d3c2017-03-28 12:16:58 +020018# along with this program. If not, see <http://www.gnu.org/licenses/>.
19
Neels Hofmeyr3531a192017-03-28 14:30:28 +020020import os
21import time
22import subprocess
23import signal
Holger Hans Peter Freyther20b52c12019-02-27 02:31:50 +000024from abc import ABCMeta, abstractmethod
Pau Espin Pedrol0d8deec2017-06-23 11:43:38 +020025from datetime import datetime
Neels Hofmeyr3531a192017-03-28 14:30:28 +020026
Pau Espin Pedrol9a4631c2018-03-28 19:17:34 +020027from . import log
28from .event_loop import MainLoop
Neels Hofmeyr3531a192017-03-28 14:30:28 +020029from .util import Dir
30
class TerminationStrategy(log.Origin, metaclass=ABCMeta):
    """Abstract base for objects that terminate a collection of processes.

    Processes are registered one by one with add_process(); how they are
    actually brought down is decided by the subclass in terminate_all().
    """

    def __init__(self):
        # NOTE(review): log.Origin.__init__() is not invoked here -- confirm
        # that this is intended.
        # Processes registered so far, in registration order.
        self._processes = []

    def add_process(self, process):
        """Register a process that terminate_all() has to take care of."""
        self._processes.append(process)

    @abstractmethod
    def terminate_all(self):
        """Terminate all registered processes and wait for the termination."""
45
46
class ParallelTerminationStrategy(TerminationStrategy):
    """Terminate all registered processes in parallel.

    Each signal is delivered to every remaining process before any waiting
    happens, so the processes shut down concurrently instead of one by one.
    """

    def _prune_dead_processes(self, poll_first):
        """Keep only the processes that are still running.

        poll_first is handed through to Process.is_running().
        """
        self._processes = [proc for proc in self._processes
                           if proc.is_running(poll_first)]

    def _build_process_map(self):
        """Rebuild the pid -> process lookup table used while reaping."""
        self._process_map = {}
        for proc in self._processes:
            pid = proc.pid()
            # Processes without a pid were never launched; nothing to reap.
            if pid is not None:
                self._process_map[pid] = proc

    def _poll_once(self):
        """Try to collect one terminated child without blocking.

        Returns True if one of our processes was collected, False if no child
        was ready or the collected child is not one of ours.
        os.waitpid() may raise ChildProcessError; that is left to the caller.
        """
        # pid 0: any child in the process group of the calling process.
        pid, status = os.waitpid(0, os.WNOHANG)
        if pid == 0:
            # No child has changed state.
            return False
        proc = self._process_map.get(pid)
        if proc is None:
            self.dbg("Unknown process with pid(%d) died." % pid)
            return False
        # Record the status as reported by waitpid() and forget the process.
        self.log("PID %d died..." % pid)
        proc.result = status
        proc.cleanup()
        self._processes.remove(proc)
        self._process_map.pop(pid)
        return True

    def _poll_for_termination(self, time_to_wait_for_term=5):
        """Reap processes until none are left or the time budget is used up.

        time_to_wait_for_term is the approximate maximum number of seconds
        spent sleeping between reaping rounds.
        """
        wait_step = 0.001
        waited_time = 0
        while self._processes:
            # Collect as many children as are ready right now.
            try:
                while self._poll_once():
                    pass
            except ChildProcessError:
                pass

            # Everything is gone: return without sleeping once more.
            if not self._processes:
                break
            waited_time += wait_step
            # Back off: make wait_step approach 1.0
            wait_step = (1. + 5. * wait_step) / 6.
            if waited_time >= time_to_wait_for_term:
                break
            time.sleep(wait_step)

    def terminate_all(self):
        """Signal all registered processes and wait for them to exit.

        SIGTERM and SIGINT are each followed by a bounded wait. SIGKILL is
        sent last to whatever is still alive and is not waited for.
        """
        num_processes = len(self._processes)
        self.dbg("Scheduled to terminate %d processes." % num_processes)
        if not num_processes:
            return
        self._prune_dead_processes(True)
        self._build_process_map()

        # Escalate through the signals, gentlest first.
        for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGKILL):
            self.dbg("Starting to kill with %s" % sig.name)
            for process in self._processes:
                process.kill(sig)
            if sig != signal.SIGKILL:
                self._poll_for_termination()
                if not self._processes:
                    return
Holger Hans Peter Freyther20b52c12019-02-27 02:31:50 +0000124
125
class Process(log.Origin):
    '''
    A local child process started with subprocess.Popen, running in run_dir
    with its stdout and stderr redirected to log files inside run_dir.
    '''

    DEFAULT_WAIT_TIMEOUT = 300 # seconds

    def __init__(self, name, run_dir, popen_args, **popen_kwargs):
        '''
        name: used for logging and as name of this log origin.
        run_dir: a Dir, or anything whose str() is a path (it gets wrapped in
                 a Dir of the absolute path).
        popen_args: the command line as a list of strings.
        popen_kwargs: additional keyword arguments handed to subprocess.Popen.
        '''
        super().__init__(log.C_RUN, name)
        # The subprocess.Popen object, None until launch() was called.
        self.process_obj = None
        # Exit status, None while not (known to be) terminated.
        self.result = None
        # The last signal sent through kill(), None if never killed.
        self.killed = None
        self.default_wait_timeout = Process.DEFAULT_WAIT_TIMEOUT
        self.name_str = name
        self.run_dir = run_dir
        self.popen_args = popen_args
        self.popen_kwargs = popen_kwargs
        # Maps output name -> (path, open file object or None once closed).
        self.outputs = {}
        if not isinstance(self.run_dir, Dir):
            self.run_dir = Dir(os.path.abspath(str(self.run_dir)))

    def set_env(self, key, value):
        '''Set one variable in the 'env' dict that is passed to Popen.'''
        env = self.popen_kwargs.get('env') or {}
        env[key] = value
        self.popen_kwargs['env'] = env

    def set_default_wait_timeout(self, timeout):
        '''Set the timeout (seconds) used by wait() when none is passed.'''
        assert timeout
        self.default_wait_timeout = timeout

    def make_output_log(self, name):
        '''
        create a non-existing log output file in run_dir to pipe stdout and
        stderr from this process to.
        Returns the open file object, which is also remembered in
        self.outputs until close_output_logs() is called.
        '''
        path = self.run_dir.new_child(name)
        f = open(path, 'w')
        self.dbg(path)
        f.write('(launched: %s)\n' % datetime.now().strftime(log.LONG_DATEFMT))
        f.flush()
        self.outputs[name] = (path, f)
        return f

    def launch(self):
        '''
        Start the process without waiting for it. Whatever subprocess.Popen
        raises is passed on to the caller; the output log files opened for
        this launch are closed in that case.
        '''
        preexec_fn = None
        # Same fallback as set_env(), so that an explicit env=None does not
        # break the debug output.
        env = self.popen_kwargs.get('env') or {}
        log.dbg('cd %r; %s %s' % (
                os.path.abspath(str(self.run_dir)),
                ' '.join(['%s=%r'%(k,v) for k,v in env.items()]),
                ' '.join(self.popen_args)))

        if self.popen_args[0] == "sudo":
            # sudo drops forwarding of signals sent by processes of the same
            # process group, which means by default will drop signals from
            # parent and children processes. By moving it to another group, we
            # will later be able to kill it.
            # Note: sudo documentation is wrong, since it states it only drops
            # signals from children.
            preexec_fn = os.setpgrp

        try:
            self.process_obj = subprocess.Popen(
                self.popen_args,
                stdout=self.make_output_log('stdout'),
                stderr=self.make_output_log('stderr'),
                stdin=subprocess.PIPE,
                preexec_fn=preexec_fn,
                shell=False,
                cwd=self.run_dir.path,
                **self.popen_kwargs)
        except Exception:
            # The program could not be started: cleanup() will never run for
            # this launch, so close the log files here instead of leaking them.
            self.close_output_logs()
            raise
        self.set_name(self.name_str, pid=self.process_obj.pid)
        self.log('Launched')

    def launch_sync(self, raise_nonsuccess=True):
        '''
        calls launch() method and block waiting for it to finish, serving the
        mainloop meanwhile.
        If launching or waiting raises, the process is terminated and the
        exception is passed on.
        Returns the exit status; if raise_nonsuccess is True, a non-zero exit
        status raises the error built by RunError() instead.
        '''
        try:
            self.launch()
            self.wait()
        except Exception:
            self.terminate()
            raise
        if raise_nonsuccess and self.result != 0:
            raise self.RunError('launch_sync()')
        return self.result

    def respawn(self):
        '''Launch a process again that is not running (anymore).'''
        self.dbg('respawn')
        assert not self.is_running()
        self.result = None
        self.killed = None
        return self.launch()

    def respawn_sync(self, raise_nonsuccess=True):
        '''Like respawn(), but wait for the process to finish, see launch_sync().'''
        self.dbg('respawn_sync')
        assert not self.is_running()
        self.result = None
        self.killed = None
        return self.launch_sync(raise_nonsuccess)

    def _poll_termination(self, time_to_wait_for_term=5):
        '''
        Poll the process with growing sleep intervals until it has exited or
        about time_to_wait_for_term seconds of sleeping have accumulated.
        Returns True if the process has exited (self.result is set then),
        False on timeout.
        '''
        wait_step = 0.001
        waited_time = 0
        while True:
            # poll returns None if proc is still running
            self.result = self.process_obj.poll()
            if self.result is not None:
                return True
            waited_time += wait_step
            # make wait_step approach 1.0
            wait_step = (1. + 5. * wait_step) / 6.
            if waited_time >= time_to_wait_for_term:
                break
            time.sleep(wait_step)
        return False

    def send_signal(self, sig):
        '''Deliver sig to the process; subclasses override the delivery.'''
        os.kill(self.process_obj.pid, sig)

    def pid(self):
        '''Return the pid of the launched process, None if never launched.'''
        if self.process_obj is None:
            return None
        return self.process_obj.pid

    def kill(self, sig):
        """Kills the process with the given signal and remembers it."""
        self.log('Terminating (%s)' % sig.name)
        self.send_signal(sig)
        self.killed = sig

    def terminate(self):
        '''
        Stop the process if it was launched and is not known to have exited:
        try SIGINT, then SIGTERM, then SIGKILL, and finally block until the
        process is gone, record its exit status and run cleanup().
        '''
        if self.process_obj is None:
            return
        if self.result is not None:
            return

        # first try SIGINT to allow stdout+stderr flushing, then SIGTERM
        for sig in (signal.SIGINT, signal.SIGTERM):
            self.kill(sig)
            if self._poll_termination():
                break
        else:
            # out of patience
            self.kill(signal.SIGKILL)

        # Store the exit status: after SIGKILL nothing has polled the process
        # yet, and leaving self.result at None would make a later poll() run
        # cleanup() a second time.
        self.result = self.process_obj.wait()
        self.cleanup()

    def cleanup(self):
        '''
        Close the output logs and log how the process ended. An exit status
        other than 0 without a prior kill() is logged as error, along with
        the tail of stdout and stderr.
        '''
        self.dbg('Cleanup')
        self.close_output_logs()
        if self.result == 0:
            self.log('Terminated: ok', rc=self.result)
        elif self.killed:
            self.log('Terminated', rc=self.result)
        else:
            self.err('Terminated: ERROR', rc=self.result)
            self.log_stdout_tail()
            self.log_stderr_tail()

    def log_stdout_tail(self):
        '''Log the last lines of stdout, if there is any output.'''
        m = self.get_stdout_tail(prefix='| ')
        if not m:
            return
        self.log('stdout:\n', m, '\n')

    def log_stderr_tail(self):
        '''Log the last lines of stderr, if there is any output.'''
        m = self.get_stderr_tail(prefix='| ')
        if not m:
            return
        self.log('stderr:\n', m, '\n')

    def close_output_logs(self):
        '''Flush and close all open output log files, keeping their paths.'''
        for k, v in self.outputs.items():
            path, f = v
            if f:
                f.flush()
                f.close()
            # Replacing the value of an existing key is safe while iterating.
            self.outputs[k] = (path, None)

    def poll(self):
        '''
        Check once whether the launched process has exited; if so, store the
        exit status in self.result and run cleanup().
        '''
        if self.process_obj is None:
            return
        if self.result is not None:
            return
        self.result = self.process_obj.poll()
        if self.result is not None:
            self.cleanup()

    def is_running(self, poll_first=True):
        '''Return True if the process was launched and has not exited yet.'''
        if poll_first:
            self.poll()
        return self.process_obj is not None and self.result is None

    def get_output(self, which):
        ''' Read process output '''
        path = self.get_output_file(which)
        if path is None:
            return None
        with open(path, 'r') as f2:
            return f2.read()

    def get_output_file(self, which):
        ''' Return filename for given output '''
        v = self.outputs.get(which)
        if not v:
            return None
        path, f = v
        return path

    def get_output_tail(self, which, tail=10, prefix=''):
        '''
        Return the last 'tail' lines of the given output, each line prepended
        with prefix, or None if there is no output.
        '''
        out = self.get_output(which)
        if not out:
            return None
        out = out.splitlines()
        tail = min(len(out), tail)
        # NOTE(review): tail=0 yields the complete output, because out[-0:] is
        # the same as out[0:] -- confirm whether that is intended.
        return prefix + ('\n' + prefix).join(out[-tail:])

    def get_stdout(self):
        '''Return everything logged from stdout so far, None if not launched.'''
        return self.get_output('stdout')

    def get_stderr(self):
        '''Return everything logged from stderr so far, None if not launched.'''
        return self.get_output('stderr')

    def get_stdout_tail(self, tail=10, prefix=''):
        '''Return the last lines of stdout, see get_output_tail().'''
        return self.get_output_tail('stdout', tail, prefix)

    def get_stderr_tail(self, tail=10, prefix=''):
        '''Return the last lines of stderr, see get_output_tail().'''
        return self.get_output_tail('stderr', tail, prefix)

    def terminated(self, poll_first=True):
        '''Return True if an exit status is known for the process.'''
        if poll_first:
            self.poll()
        return self.result is not None

    def wait(self, timeout=None):
        '''
        Hand terminated() to MainLoop.wait() with the given timeout in
        seconds, or with the default wait timeout if timeout is None.
        '''
        # presumably MainLoop.wait() returns once terminated() is true and
        # complains on timeout -- verify against event_loop.
        if timeout is None:
            timeout = self.default_wait_timeout
        MainLoop.wait(self.terminated, timeout=timeout)

    def stdin_write(self, cmd):
        '''
        Send a cmd to the stdin of a process (convert to byte before)
        '''
        if self.process_obj.stdin is not None:
            self.process_obj.stdin.write(cmd.encode("utf-8"))
            self.process_obj.stdin.flush()

    def RunError(self, msg_prefix):
        'Get a log.Error filled in with Result information. Use when program is terminated and result !=0'
        msg = '%s: local process exited with status %d' % (msg_prefix, self.result)
        return log.Error(msg)
381
class RemoteProcess(Process):
    '''
    A Process that runs its command line on a remote host, by wrapping it
    into a local ssh invocation.
    '''

    def __init__(self, name, run_dir, remote_user, remote_host, remote_cwd, popen_args, remote_env=None, **popen_kwargs):
        '''
        remote_user, remote_host: ssh login, used as remote_user@remote_host.
        remote_cwd: directory to cd into on the remote side before running
                    the command; no cd is issued if empty or None.
        popen_args: the remote command line as a list of strings.
        remote_env: dict of environment variables to set for the remote
                    command; None (the default) means no variables.
        The remaining arguments are the same as for Process.
        '''
        super().__init__(name, run_dir, popen_args, **popen_kwargs)
        self.remote_user = remote_user
        self.remote_host = remote_host
        self.remote_cwd = remote_cwd
        # A fresh dict per instance: a {} default in the signature would be
        # one single dict shared by all instances created without remote_env.
        self.remote_env = {} if remote_env is None else remote_env

        # hacky: instead of just prepending ssh, i.e. piping stdout and stderr
        # over the ssh link, we should probably run on the remote side,
        # monitoring the process remotely.
        if self.remote_cwd:
            cd = 'cd "%s";' % self.remote_cwd
        else:
            cd = ''
        # We need double -t to force tty and be able to forward signals to
        # processes (SIGHUP) when we close ssh on the local side. As a result,
        # stderr seems to be merged into stdout in ssh client.
        self.popen_args = ['ssh', '-t', '-t', self.remote_user+'@'+self.remote_host,
                           '%s %s %s' % (cd,
                                         ' '.join(['%s=%r'%(k,v) for k,v in self.remote_env.items()]),
                                         ' '.join(self.popen_args))]
        self.dbg(self.popen_args, dir=self.run_dir, conf=self.popen_kwargs, remote_env=self.remote_env)

    def RunError(self, msg_prefix):
        'Overwrite Process method with ssh extra information'
        # man ssh states it returns 255 if an ssh error occurred:
        msg = msg_prefix + ': '
        if self.result == 255:
            # Only append the last stderr line if it mentions ssh.
            tail = ' (' + (self.get_stderr_tail(tail=1, prefix='') or '').rstrip() + ')'
            msg += 'local ssh process exited with status %d%s' % (self.result, tail if 'ssh' in tail else '')
        else:
            msg += 'remote process exited with status %d' % (self.result)
        return log.Error(msg)
417
Pau Espin Pedrolfd4c1442018-10-25 17:37:23 +0200418class NetNSProcess(Process):
419 NETNS_EXEC_BIN = 'osmo-gsm-tester_netns_exec.sh'
420 def __init__(self, name, run_dir, netns, popen_args, **popen_kwargs):
421 super().__init__(name, run_dir, popen_args, **popen_kwargs)
422 self.netns = netns
423
424 self.popen_args = ['sudo', self.NETNS_EXEC_BIN, self.netns] + list(popen_args)
425 self.dbg(self.popen_args, dir=self.run_dir, conf=self.popen_kwargs)
426
427 # HACK: Since we run under sudo, only way to kill root-owned process is to kill as root...
428 # This function is overwritten from Process.
429 def send_signal(self, sig):
Pau Espin Pedrole159cd22019-04-03 17:53:54 +0200430 if sig == signal.SIGKILL:
431 # if we kill sudo, its children (bash running NETNS_EXEC_BIN +
432 # tcpdump under it) are kept alive. Let's instead tell the script to
433 # kill tcpdump:
434 sig = signal.SIGUSR1
Pau Espin Pedrolfd4c1442018-10-25 17:37:23 +0200435 kill_cmd = ('kill', '-%d' % int(sig), str(self.process_obj.pid))
Pau Espin Pedrol17a4ed92019-04-03 17:10:31 +0200436 run_local_netns_sync(self.run_dir, self.name()+"-kill"+str(sig), self.netns, kill_cmd)
Pau Espin Pedrolfd4c1442018-10-25 17:37:23 +0200437
Pau Espin Pedrol14022d32020-02-11 14:20:00 +0100438class RemoteNetNSProcess(RemoteProcess):
439 NETNS_EXEC_BIN = 'osmo-gsm-tester_netns_exec.sh'
440 def __init__(self, name, run_dir, remote_user, remote_host, remote_cwd, netns, popen_args, **popen_kwargs):
Pau Espin Pedrol4983eb52020-02-11 19:16:06 +0100441 self.netns = netns
Pau Espin Pedrol14022d32020-02-11 14:20:00 +0100442 args = ['sudo', self.NETNS_EXEC_BIN, self.netns] + list(popen_args)
443 super().__init__(name, run_dir, remote_user, remote_host, remote_cwd, args, **popen_kwargs)
Pau Espin Pedrolfd4c1442018-10-25 17:37:23 +0200444
Pau Espin Pedrole4358a92018-10-01 11:27:55 +0200445def run_local_sync(run_dir, name, popen_args):
446 run_dir =run_dir.new_dir(name)
447 proc = Process(name, run_dir, popen_args)
Pau Espin Pedrol79df7392018-11-12 18:15:30 +0100448 proc.launch_sync()
Pau Espin Pedrol12c5ea42019-11-26 14:25:33 +0100449 return proc
Pau Espin Pedrole4358a92018-10-01 11:27:55 +0200450
Pau Espin Pedrolfd4c1442018-10-25 17:37:23 +0200451def run_local_netns_sync(run_dir, name, netns, popen_args):
452 run_dir =run_dir.new_dir(name)
453 proc = NetNSProcess(name, run_dir, netns, popen_args)
Pau Espin Pedrol79df7392018-11-12 18:15:30 +0100454 proc.launch_sync()
Pau Espin Pedrol12c5ea42019-11-26 14:25:33 +0100455 return proc
Neels Hofmeyrdae3d3c2017-03-28 12:16:58 +0200456# vim: expandtab tabstop=4 shiftwidth=4