Panda3D
|
00001 /* Filename: autorestart.c 00002 * Created by: drose (05Sep02) 00003 * 00004 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 00005 * 00006 * PANDA 3D SOFTWARE 00007 * Copyright (c) Carnegie Mellon University. All rights reserved. 00008 * 00009 * All use of this software is subject to the terms of the revised BSD 00010 * license. You should have received a copy of this license along 00011 * with this source code in a file named "LICENSE." 00012 * 00013 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ 00014 00015 /*#include "dtool_config.h"*/ 00016 #include "dtoolbase.h" 00017 00018 #include <getopt.h> 00019 #include <stdio.h> 00020 #include <errno.h> 00021 #include <string.h> /* for strerror */ 00022 #include <unistd.h> 00023 #include <sys/types.h> 00024 #include <sys/wait.h> 00025 #include <sys/stat.h> 00026 #include <fcntl.h> 00027 #include <time.h> 00028 #include <signal.h> 00029 #include <stdlib.h> 00030 #include <assert.h> 00031 00032 char **params = NULL; 00033 char *logfile_name = NULL; 00034 char *pidfile_name = NULL; 00035 int logfile_fd = -1; 00036 int stop_on_terminate = 0; 00037 int stop_always = 0; 00038 char *respawn_script = NULL; 00039 int respawn_count_time = 0; 00040 00041 /* If requested, delay these many seconds between restart attempts */ 00042 int respawn_delay_time = 5; 00043 00044 00045 /* We shouldn't respawn more than (spam_respawn_count - 1) times over 00046 spam_respawn_time seconds. */ 00047 int spam_respawn_count = 5; 00048 int spam_respawn_time = 60; 00049 int spam_restart_delay_time = 600; /* Optionally, do not exit if we spam too much; simply sleep for this many seconds*/ 00050 00051 00052 00053 pid_t child_pid = 0; 00054 00055 #define TIME_BUFFER_SIZE 128 00056 00057 /* Keep track of the frequency with which we respawn, so we can report 00058 this to our respawn script. */ 00059 typedef struct respawn_record_struct { 00060 time_t _time; 00061 struct respawn_record_struct *_next; 00062 } respawn_record; 00063 00064 respawn_record *respawns = NULL; 00065 00066 int 00067 record_respawn(time_t now) { 00068 /* Records the respawning event in the respawn_record, and returns 00069 the number of respawns in the last respawn_count_time 00070 interval. */ 00071 respawn_record *rec; 00072 respawn_record *next; 00073 int count; 00074 00075 if (respawn_count_time <= 0) { 00076 /* We're not tracking respawns if respawn_count_time is 0. */ 00077 return 0; 00078 } 00079 00080 rec = (respawn_record *)malloc(sizeof(respawn_record)); 00081 rec->_time = now; 00082 rec->_next = respawns; 00083 respawns = rec; 00084 00085 /* Now walk through the rest of the list and count up the number of 00086 respawn events until we reach a record more than 00087 respawn_count_time seconds old. */ 00088 count = 0; 00089 while (rec->_next != NULL && 00090 (now - rec->_time) <= respawn_count_time) { 00091 rec = rec->_next; 00092 count++; 00093 } 00094 00095 /* The remaining respawn records get removed. */ 00096 next = rec->_next; 00097 rec->_next = NULL; 00098 while (next != NULL) { 00099 rec = next; 00100 next = rec->_next; 00101 free(rec); 00102 } 00103 00104 return count; 00105 } 00106 00107 void 00108 invoke_respawn_script(time_t now) { 00109 char buffer[32]; 00110 char *new_command; 00111 int new_command_length; 00112 00113 /* The process is about to be respawned; run the script that we were 00114 given on the command line. */ 00115 if (respawn_count_time <= 0) { 00116 /* We're not counting respawn times, so just run the script 00117 directly. */ 00118 system(respawn_script); 00119 00120 } else { 00121 /* We are counting respawn times, so append that information as a 00122 parameter to the command. */ 00123 sprintf(buffer, " %d", record_respawn(now)); 00124 new_command_length = strlen(respawn_script) + strlen(buffer); 00125 new_command = (char *)malloc(new_command_length + 1); 00126 strcpy(new_command, respawn_script); 00127 strcat(new_command, buffer); 00128 assert(strlen(new_command) == new_command_length); 00129 00130 system(new_command); 00131 00132 free(new_command); 00133 } 00134 } 00135 00136 void 00137 exec_process() { 00138 /* First, output the command line to the log file. */ 00139 char **p; 00140 for (p = params; *p != NULL; ++p) { 00141 fprintf(stderr, "%s ", *p); 00142 } 00143 fprintf(stderr, "\n"); 00144 execvp(params[0], params); 00145 fprintf(stderr, "Cannot exec %s: %s\n", params[0], strerror(errno)); 00146 00147 /* Exit with a status of 0, to indicate to the parent process that 00148 we should stop. */ 00149 exit(0); 00150 } 00151 00152 int 00153 spawn_process() { 00154 /* Spawns the child process. Returns true if the process terminated 00155 by itself and should be respawned, false if it was explicitly 00156 killed (or some other error condition exists), and it should not 00157 respawn any more. */ 00158 pid_t wresult; 00159 int status; 00160 00161 child_pid = fork(); 00162 if (child_pid < 0) { 00163 /* Fork error. */ 00164 perror("fork"); 00165 return 0; 00166 } 00167 00168 if (child_pid == 0) { 00169 /* Child. Exec the process. */ 00170 fprintf(stderr, "Child pid is %d.\n", getpid()); 00171 exec_process(); 00172 /* Shouldn't get here. */ 00173 exit(1); 00174 } 00175 00176 /* Parent. Wait for the child to terminate, then diagnose the reason. */ 00177 wresult = waitpid(child_pid, &status, 0); 00178 if (wresult < 0) { 00179 perror("waitpid"); 00180 return 0; 00181 } 00182 00183 /* Now that we've returned from waitpid, clear the child pid number 00184 so our signal handler doesn't get too confused. */ 00185 child_pid = 0; 00186 00187 if (WIFSIGNALED(status)) { 00188 int signal = WTERMSIG(status); 00189 fprintf(stderr, "\nprocess caught signal %d.\n\n", signal); 00190 /* A signal exit is a reason to respawn unless the signal is TERM 00191 or KILL. */ 00192 return !stop_on_terminate || (signal != SIGTERM && signal != SIGKILL); 00193 00194 } else { 00195 int exit_status = WEXITSTATUS(status); 00196 fprintf(stderr, "\nprocess exited with status %d.\n\n", WEXITSTATUS(status)); 00197 /* Normal exit is a reason to respawn if the status indicates failure. */ 00198 return !stop_on_terminate || (exit_status != 0); 00199 } 00200 } 00201 00202 void 00203 sigterm_handler() { 00204 pid_t wresult; 00205 int status; 00206 time_t now; 00207 char time_buffer[TIME_BUFFER_SIZE]; 00208 00209 now = time(NULL); 00210 strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now)); 00211 00212 fprintf(stderr, "\nsigterm caught at %s; shutting down.\n", time_buffer); 00213 if (child_pid == 0) { 00214 fprintf(stderr, "no child process.\n\n"); 00215 00216 } else { 00217 kill(child_pid, SIGTERM); 00218 00219 wresult = waitpid(child_pid, &status, 0); 00220 if (wresult < 0) { 00221 perror("waitpid"); 00222 } else { 00223 fprintf(stderr, "child process terminated.\n\n"); 00224 } 00225 } 00226 exit(1); 00227 } 00228 00229 void 00230 sigalarm_handler() { 00231 fprintf(stderr, "sleep epoch was complete.\n"); 00232 } 00233 00234 void 00235 do_autorestart() { 00236 char time_buffer[TIME_BUFFER_SIZE]; 00237 time_t now; 00238 time_t *spam_respawn = NULL; 00239 int sri, num_sri; 00240 struct sigaction sa; 00241 00242 if (spam_respawn_count > 1) { 00243 spam_respawn = (time_t *)malloc(sizeof(time_t) * spam_respawn_count); 00244 } 00245 00246 /* Make our process its own process group. */ 00247 setpgid(0, 0); 00248 00249 /* Set up a signal handler to trap SIGTERM. */ 00250 sa.sa_handler = sigterm_handler; 00251 sigemptyset(&sa.sa_mask); 00252 sa.sa_flags = 0; 00253 if (sigaction(SIGTERM, &sa, NULL) < 0) { 00254 perror("sigaction"); 00255 } 00256 00257 if (logfile_fd >= 0) { 00258 /* If we have a logfile, dup it onto stdout and stderr. */ 00259 dup2(logfile_fd, STDOUT_FILENO); 00260 dup2(logfile_fd, STDERR_FILENO); 00261 close(logfile_fd); 00262 } 00263 00264 /* Make sure stdin is closed. */ 00265 close(STDIN_FILENO); 00266 00267 now = time(NULL); 00268 strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now)); 00269 fprintf(stderr, "autorestart begun at %s.\n", time_buffer); 00270 00271 if (pidfile_name != NULL) { 00272 unlink(pidfile_name); 00273 FILE *pidfile = fopen(pidfile_name, "w"); 00274 if (pidfile == NULL) { 00275 fprintf(stderr, "Could not write pidfile %s\n", pidfile_name); 00276 } else { 00277 fprintf(pidfile, "%d\n", getpid()); 00278 fclose(pidfile); 00279 } 00280 } 00281 00282 sri = 1; 00283 num_sri = 1; 00284 if (spam_respawn_count > 1) { 00285 spam_respawn[1] = now; 00286 } 00287 00288 while (spawn_process()) { 00289 now = time(NULL); 00290 00291 if (respawn_script != NULL) { 00292 invoke_respawn_script(now); 00293 } 00294 00295 if (respawn_delay_time) { 00296 sleep(respawn_delay_time); 00297 } 00298 00299 /* Make sure we're not respawning too fast. */ 00300 if (spam_respawn_count > 1) { 00301 sri = (sri + 1) % spam_respawn_count; 00302 spam_respawn[sri] = now; 00303 if (num_sri < spam_respawn_count) { 00304 num_sri++; 00305 } else { 00306 time_t last = spam_respawn[(sri + 1) % spam_respawn_count]; 00307 if (now - last < spam_respawn_time) 00308 { 00309 if(!spam_restart_delay_time) 00310 { 00311 fprintf(stderr, "respawning too fast, giving up.\n"); 00312 break; 00313 } 00314 else 00315 { 00316 num_sri = 1; /* reset num_sri */ 00317 fprintf(stderr, "respawning too fast, will sleep for %d seconds.\n", spam_restart_delay_time); 00318 signal (SIGALRM, sigalarm_handler); 00319 alarm(spam_restart_delay_time); 00320 pause(); 00321 signal (SIGALRM, SIG_IGN); 00322 } 00323 } 00324 } 00325 } 00326 00327 if (stop_always) { 00328 fprintf(stderr, "instructed to not autorestart, exiting.\n"); 00329 break; 00330 } 00331 00332 strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now)); 00333 fprintf(stderr, "respawning at %s.\n", time_buffer); 00334 } 00335 00336 now = time(NULL); 00337 strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now)); 00338 fprintf(stderr, "autorestart terminated at %s.\n", time_buffer); 00339 exit(0); 00340 } 00341 00342 void 00343 double_fork() { 00344 pid_t child, grandchild, wresult; 00345 int status; 00346 00347 /* Fork once, then again, to disassociate the child from the command 00348 shell process group. */ 00349 child = fork(); 00350 if (child < 0) { 00351 /* Failure to fork. */ 00352 perror("fork"); 00353 exit(1); 00354 } 00355 00356 if (child == 0) { 00357 /* Child. Fork again. */ 00358 grandchild = fork(); 00359 if (grandchild < 0) { 00360 perror("fork"); 00361 exit(1); 00362 } 00363 00364 if (grandchild == 0) { 00365 /* Grandchild. Begin useful work. */ 00366 do_autorestart(); 00367 /* Shouldn't get here. */ 00368 exit(1); 00369 } 00370 00371 /* Child. Report the new pid, then terminate gracefully. */ 00372 fprintf(stderr, "Spawned, monitoring pid is %d.\n", grandchild); 00373 exit(0); 00374 } 00375 00376 /* Parent. Wait for the child to terminate, then return. */ 00377 wresult = waitpid(child, &status, 0); 00378 if (wresult < 0) { 00379 perror("waitpid"); 00380 exit(1); 00381 } 00382 00383 if (!WIFEXITED(status)) { 00384 if (WIFSIGNALED(status)) { 00385 fprintf(stderr, "child caught signal %d unexpectedly.\n", WTERMSIG(status)); 00386 } else { 00387 fprintf(stderr, "child exited with status %d.\n", WEXITSTATUS(status)); 00388 } 00389 exit(1); 00390 } 00391 } 00392 00393 void 00394 usage() { 00395 fprintf(stderr, 00396 "\n" 00397 "autorestart [opts] program [args . . . ]\n" 00398 "autorestart -h\n\n"); 00399 } 00400 00401 void 00402 help() { 00403 usage(); 00404 fprintf(stderr, 00405 "This program is used to run a program as a background task and\n" 00406 "automatically restart it should it terminate for any reason other\n" 00407 "than normal exit or explicit user kill.\n\n" 00408 00409 "If the program exits with a status of 0, indicating successful\n" 00410 "completion, it is not restarted.\n\n" 00411 00412 "If the program is terminated via a TERM or KILL signal (e.g. via\n" 00413 "kill [pid] or kill -9 [pid]), it is assumed the user meant for the\n" 00414 "process to stop, and it is not restarted.\n\n" 00415 00416 "Options:\n\n" 00417 00418 " -l logfilename\n" 00419 " Route stdout and stderr from the child process into the indicated\n" 00420 " log file.\n\n" 00421 00422 " -p pidfilename\n" 00423 " Write the pid of the monitoring process to the indicated pidfile.\n\n" 00424 00425 " -n\n" 00426 " Do not attempt to restart the process under any circumstance.\n" 00427 " The program can still be used to execute a script on abnormal\n" 00428 " process termination.\n\n" 00429 00430 " -t\n" 00431 " Stop on terminate: don't restart if the child process exits\n" 00432 " normally or is killed with a SIGTERM. With this flag, the\n" 00433 " child process will be restarted only if it exits with a\n" 00434 " non-zero exit status, or if it is killed with a signal other\n" 00435 " than SIGTERM. Without this flag, the default behavior is to\n" 00436 " restarted the child process if it exits for any reason.\n\n" 00437 00438 " -r count,secs,sleep\n" 00439 " Sleep 'sleep' seconds if the process respawns 'count' times\n" 00440 " within 'secs' seconds. This is designed to prevent respawning\n" 00441 " from using too many system resources if something is wrong with\n" 00442 " the child process. The default value is %d,%d,%d. Use -r 0,0,0\n" 00443 " to disable this feature.\n\n" 00444 00445 " -s \"command\"\n" 00446 " Run the indicated command or script each time the process is\n" 00447 " respawned, using the system() call. This may be useful, for\n" 00448 " instance, to notify an operator via email each time a respawn\n" 00449 " occurs. If -c is also specified, an additional parameter will\n" 00450 " be appended to the command, indicating the number of times the\n" 00451 " respawn has occurred in the given time interval.\n\n" 00452 00453 " -c secs\n" 00454 " Specifies the number of seconds over which to count respawn events\n" 00455 " for the purposes of passing an argument to the script named with\n" 00456 " -s.\n\n" 00457 00458 " -d secs\n" 00459 " Specifies the number of seconds to delay for between restarts.\n" 00460 " The default is %d.\n\n" 00461 00462 " -h\n" 00463 " Output this help information.\n\n", 00464 spam_respawn_count, spam_respawn_time, spam_restart_delay_time, respawn_delay_time); 00465 } 00466 00467 void 00468 parse_int_triplet(char *param, int *a, int *b, int *c) { 00469 char *comma; 00470 char *comma2; 00471 00472 comma = strchr(param, ','); 00473 if (comma == NULL) { 00474 fprintf(stderr, "Comma required: %s\n", param); 00475 exit(1); 00476 } 00477 00478 comma2 = strchr(comma+1, ','); 00479 if (comma2 == NULL) { 00480 fprintf(stderr, "Second comma required: %s\n", param); 00481 exit(1); 00482 } 00483 00484 *comma = '\0'; 00485 *comma2 = '\0'; 00486 00487 *a = atoi(param); 00488 *b = atoi(comma + 1); 00489 *c = atoi(comma2 + 1); 00490 } 00491 00492 int 00493 main(int argc, char *argv[]) { 00494 extern char *optarg; 00495 extern int optind; 00496 /* The initial '+' instructs GNU getopt not to reorder switches. */ 00497 static const char *optflags = "+l:p:ntr:s:c:d:wh"; 00498 int flag; 00499 00500 flag = getopt(argc, argv, optflags); 00501 while (flag != EOF) { 00502 switch (flag) { 00503 case 'l': 00504 logfile_name = optarg; 00505 break; 00506 00507 case 'p': 00508 pidfile_name = optarg; 00509 break; 00510 00511 case 'n': 00512 stop_always = 1; 00513 break; 00514 00515 case 't': 00516 stop_on_terminate = 1; 00517 break; 00518 00519 case 'r': 00520 parse_int_triplet(optarg, &spam_respawn_count, &spam_respawn_time, &spam_restart_delay_time); 00521 break; 00522 00523 case 'w': 00524 spam_restart_delay_time = atoi(optarg); 00525 break; 00526 00527 case 's': 00528 respawn_script = optarg; 00529 break; 00530 00531 case 'c': 00532 respawn_count_time = atoi(optarg); 00533 break; 00534 00535 case 'd': 00536 respawn_delay_time = atoi(optarg); 00537 break; 00538 00539 case 'h': 00540 help(); 00541 return 1; 00542 00543 case '?': 00544 case '+': 00545 usage(); 00546 return 1; 00547 00548 default: 00549 fprintf(stderr, "Unhandled switch: -%c\n", flag); 00550 return 1; 00551 } 00552 flag = getopt(argc, argv, optflags); 00553 } 00554 00555 argc -= (optind - 1); 00556 argv += (optind - 1); 00557 00558 if (argc < 2) { 00559 fprintf(stderr, "No program to execute given.\n"); 00560 usage(); 00561 return 1; 00562 } 00563 00564 params = &argv[1]; 00565 00566 if (logfile_name != NULL) { 00567 logfile_fd = open(logfile_name, O_WRONLY | O_CREAT | O_TRUNC, 0666); 00568 if (logfile_fd < 0) { 00569 fprintf(stderr, "Cannot write to logfile %s: %s\n", 00570 logfile_name, strerror(errno)); 00571 return 1; 00572 } 00573 fprintf(stderr, "Generating output to %s.\n", logfile_name); 00574 } 00575 00576 double_fork(); 00577 00578 return 0; 00579 } 00580