Panda3D

autorestart.c

00001 /* Filename: autorestart.c
00002  * Created by:  drose (05Sep02)
00003  *
00004  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
00005  *
00006  * PANDA 3D SOFTWARE
00007  * Copyright (c) Carnegie Mellon University.  All rights reserved.
00008  *
00009  * All use of this software is subject to the terms of the revised BSD
00010  * license.  You should have received a copy of this license along
00011  * with this source code in a file named "LICENSE."
00012  *
00013  * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
00014 
/*#include "dtool_config.h"*/
#include "dtoolbase.h"

#include <getopt.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>  /* for INT_MIN, INT_MAX */
#include <string.h>  /* for strerror */
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>
#include <stdlib.h>
#include <assert.h>
00031 
/* Size of the stack buffers used to format timestamps with strftime. */
#define TIME_BUFFER_SIZE 128

/* The command to run: a NULL-terminated, argv-style vector whose
   first entry is the program name handed to execvp. */
char **params = NULL;

/* Output files named on the command line (-l and -p), and the
   descriptor of the opened log file (-1 while there is none). */
char *logfile_name = NULL;
char *pidfile_name = NULL;
int logfile_fd = -1;

/* Restart policy switches, set by -t and -n respectively. */
int stop_on_terminate = 0;
int stop_always = 0;

/* The command run through system() before each respawn (-s), and the
   number of seconds over which respawns are counted for it (-c).  A
   count time of 0 turns the counting off. */
char *respawn_script = NULL;
int respawn_count_time = 0;

/* If requested, delay these many seconds between restart attempts */
int respawn_delay_time = 5;

/* Respawn throttling (-r).  We shouldn't respawn more than
   (spam_respawn_count - 1) times over spam_respawn_time seconds.  When
   that happens we sleep for spam_restart_delay_time seconds instead of
   exiting; a delay of 0 means give up and exit. */
int spam_respawn_count = 5;
int spam_respawn_time = 60;
int spam_restart_delay_time = 600;

/* The pid of the child currently being monitored, or 0 when there is
   none.  The SIGTERM handler reads this to decide whom to kill. */
pid_t child_pid = 0;
00056 
00057 /* Keep track of the frequency with which we respawn, so we can report
00058    this to our respawn script. */
00059 typedef struct respawn_record_struct {
00060   time_t _time;
00061   struct respawn_record_struct *_next;
00062 } respawn_record;
00063 
00064 respawn_record *respawns = NULL;
00065 
00066 int
00067 record_respawn(time_t now) {
00068   /* Records the respawning event in the respawn_record, and returns
00069      the number of respawns in the last respawn_count_time
00070      interval. */
00071   respawn_record *rec;
00072   respawn_record *next;
00073   int count;
00074 
00075   if (respawn_count_time <= 0) {
00076     /* We're not tracking respawns if respawn_count_time is 0. */
00077     return 0;
00078   }
00079 
00080   rec = (respawn_record *)malloc(sizeof(respawn_record));
00081   rec->_time = now;
00082   rec->_next = respawns;
00083   respawns = rec;
00084 
00085   /* Now walk through the rest of the list and count up the number of
00086      respawn events until we reach a record more than
00087      respawn_count_time seconds old. */
00088   count = 0;
00089   while (rec->_next != NULL &&
00090          (now - rec->_time) <= respawn_count_time) {
00091     rec = rec->_next;
00092     count++;
00093   }
00094 
00095   /* The remaining respawn records get removed. */
00096   next = rec->_next;
00097   rec->_next = NULL;
00098   while (next != NULL) {
00099     rec = next;
00100     next = rec->_next;
00101     free(rec);
00102   }
00103 
00104   return count;
00105 }
00106 
00107 void
00108 invoke_respawn_script(time_t now) {
00109   char buffer[32];
00110   char *new_command;
00111   int new_command_length;
00112 
00113   /* The process is about to be respawned; run the script that we were
00114      given on the command line. */
00115   if (respawn_count_time <= 0) {
00116     /* We're not counting respawn times, so just run the script
00117        directly. */
00118     system(respawn_script);
00119 
00120   } else {
00121     /* We are counting respawn times, so append that information as a
00122        parameter to the command. */
00123     sprintf(buffer, " %d", record_respawn(now));
00124     new_command_length = strlen(respawn_script) + strlen(buffer);
00125     new_command = (char *)malloc(new_command_length + 1);
00126     strcpy(new_command, respawn_script);
00127     strcat(new_command, buffer);
00128     assert(strlen(new_command) == new_command_length);
00129 
00130     system(new_command);
00131 
00132     free(new_command);
00133   }
00134 }
00135 
00136 void
00137 exec_process() {
00138   /* First, output the command line to the log file. */
00139   char **p;
00140   for (p = params; *p != NULL; ++p) {
00141     fprintf(stderr, "%s ", *p);
00142   }
00143   fprintf(stderr, "\n");
00144   execvp(params[0], params);
00145   fprintf(stderr, "Cannot exec %s: %s\n", params[0], strerror(errno));
00146 
00147   /* Exit with a status of 0, to indicate to the parent process that
00148      we should stop. */
00149   exit(0); 
00150 }
00151 
00152 int
00153 spawn_process() {
00154   /* Spawns the child process.  Returns true if the process terminated
00155      by itself and should be respawned, false if it was explicitly
00156      killed (or some other error condition exists), and it should not
00157      respawn any more. */
00158   pid_t wresult;
00159   int status;
00160 
00161   child_pid = fork();
00162   if (child_pid < 0) {
00163     /* Fork error. */
00164     perror("fork");
00165     return 0;
00166   }
00167 
00168   if (child_pid == 0) {
00169     /* Child.  Exec the process. */
00170     fprintf(stderr, "Child pid is %d.\n", getpid());
00171     exec_process();
00172     /* Shouldn't get here. */
00173     exit(1);
00174   }
00175 
00176   /* Parent.  Wait for the child to terminate, then diagnose the reason. */
00177   wresult = waitpid(child_pid, &status, 0);
00178   if (wresult < 0) {
00179     perror("waitpid");
00180     return 0;
00181   }
00182 
00183   /* Now that we've returned from waitpid, clear the child pid number
00184      so our signal handler doesn't get too confused. */
00185   child_pid = 0;
00186 
00187   if (WIFSIGNALED(status)) {
00188     int signal = WTERMSIG(status);
00189     fprintf(stderr, "\nprocess caught signal %d.\n\n", signal);
00190     /* A signal exit is a reason to respawn unless the signal is TERM
00191        or KILL. */
00192     return !stop_on_terminate || (signal != SIGTERM && signal != SIGKILL);
00193 
00194   } else {
00195     int exit_status = WEXITSTATUS(status);
00196     fprintf(stderr, "\nprocess exited with status %d.\n\n", WEXITSTATUS(status));
00197     /* Normal exit is a reason to respawn if the status indicates failure. */
00198     return !stop_on_terminate || (exit_status != 0);
00199   }
00200 }
00201 
00202 void
00203 sigterm_handler() {
00204   pid_t wresult;
00205   int status;
00206   time_t now;
00207   char time_buffer[TIME_BUFFER_SIZE];
00208 
00209   now = time(NULL);
00210   strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now));
00211 
00212   fprintf(stderr, "\nsigterm caught at %s; shutting down.\n", time_buffer);
00213   if (child_pid == 0) {
00214     fprintf(stderr, "no child process.\n\n");
00215 
00216   } else {
00217     kill(child_pid, SIGTERM);
00218 
00219     wresult = waitpid(child_pid, &status, 0);
00220     if (wresult < 0) {
00221       perror("waitpid");
00222     } else {
00223       fprintf(stderr, "child process terminated.\n\n");
00224     }
00225   }
00226   exit(1);
00227 }
00228 
void
sigalarm_handler(int signum) {
  /* Signal handler for SIGALRM, installed by do_autorestart around its
     throttling pause().  It only logs that the sleep is over; the
     handler returning is what lets pause() return.  The int parameter
     is required by the signal-handler prototype and is not used. */
  (void)signum;
  fprintf(stderr, "sleep epoch was complete.\n");
}
00233 
00234 void
00235 do_autorestart() {
00236   char time_buffer[TIME_BUFFER_SIZE];
00237   time_t now;
00238   time_t *spam_respawn = NULL;
00239   int sri, num_sri;
00240   struct sigaction sa;
00241 
00242   if (spam_respawn_count > 1) {
00243     spam_respawn = (time_t *)malloc(sizeof(time_t) * spam_respawn_count);
00244   }
00245 
00246   /* Make our process its own process group. */
00247   setpgid(0, 0);
00248 
00249   /* Set up a signal handler to trap SIGTERM. */
00250   sa.sa_handler = sigterm_handler;
00251   sigemptyset(&sa.sa_mask);
00252   sa.sa_flags = 0;
00253   if (sigaction(SIGTERM, &sa, NULL) < 0) {
00254     perror("sigaction");
00255   }
00256 
00257   if (logfile_fd >= 0) {
00258     /* If we have a logfile, dup it onto stdout and stderr. */
00259     dup2(logfile_fd, STDOUT_FILENO);
00260     dup2(logfile_fd, STDERR_FILENO);
00261     close(logfile_fd);
00262   }
00263 
00264   /* Make sure stdin is closed. */
00265   close(STDIN_FILENO);
00266 
00267   now = time(NULL);
00268   strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now));
00269   fprintf(stderr, "autorestart begun at %s.\n", time_buffer);
00270 
00271   if (pidfile_name != NULL) {
00272     unlink(pidfile_name);
00273     FILE *pidfile = fopen(pidfile_name, "w");
00274     if (pidfile == NULL) {
00275       fprintf(stderr, "Could not write pidfile %s\n", pidfile_name);
00276     } else {
00277       fprintf(pidfile, "%d\n", getpid());
00278       fclose(pidfile);
00279     }
00280   }
00281 
00282   sri = 1;
00283   num_sri = 1;
00284   if (spam_respawn_count > 1) {
00285     spam_respawn[1] = now;
00286   }
00287   
00288   while (spawn_process()) {
00289     now = time(NULL);
00290 
00291     if (respawn_script != NULL) {
00292       invoke_respawn_script(now);
00293     }
00294     
00295     if (respawn_delay_time) {
00296       sleep(respawn_delay_time);
00297     }
00298 
00299     /* Make sure we're not respawning too fast. */
00300     if (spam_respawn_count > 1) {
00301       sri = (sri + 1) % spam_respawn_count;
00302       spam_respawn[sri] = now;
00303       if (num_sri < spam_respawn_count) {
00304         num_sri++;
00305       } else {
00306         time_t last = spam_respawn[(sri + 1) % spam_respawn_count];
00307         if (now - last < spam_respawn_time) 
00308         {
00309           if(!spam_restart_delay_time) 
00310           {
00311             fprintf(stderr, "respawning too fast, giving up.\n");
00312             break;
00313           } 
00314           else 
00315           {
00316             num_sri = 1; /* reset num_sri */
00317             fprintf(stderr, "respawning too fast, will sleep for %d seconds.\n", spam_restart_delay_time);
00318             signal (SIGALRM, sigalarm_handler);
00319             alarm(spam_restart_delay_time);
00320             pause();
00321             signal (SIGALRM, SIG_IGN);
00322           }
00323         }
00324       }
00325     }
00326     
00327     if (stop_always) {
00328       fprintf(stderr, "instructed to not autorestart, exiting.\n");
00329       break;
00330     }
00331       
00332     strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now));
00333     fprintf(stderr, "respawning at %s.\n", time_buffer);
00334   }
00335 
00336   now = time(NULL);
00337   strftime(time_buffer, TIME_BUFFER_SIZE, "%T on %A, %d %b %Y", localtime(&now));
00338   fprintf(stderr, "autorestart terminated at %s.\n", time_buffer);
00339   exit(0);
00340 }
00341 
void
double_fork() {
  /* Launches the monitoring process as a grandchild of the caller.
     The intermediate child reports the monitor's pid and exits right
     away; the caller waits for that intermediate child and returns
     only if it exited with status 0.  On any failure (fork, waitpid,
     or an abnormal or non-zero termination of the intermediate child)
     the calling process exits with status 1. */
  pid_t child, grandchild, wresult;
  int status;

  /* Fork once, then again, to disassociate the child from the command
     shell process group. */
  child = fork();
  if (child < 0) {
    /* Failure to fork. */
    perror("fork");
    exit(1);
  }

  if (child == 0) {
    /* Child.  Fork again. */
    grandchild = fork();
    if (grandchild < 0) {
      perror("fork");
      exit(1);
    }

    if (grandchild == 0) {
      /* Grandchild.  Begin useful work. */
      do_autorestart();
      /* Shouldn't get here. */
      exit(1);
    }

    /* Child.  Report the new pid, then terminate gracefully. */
    fprintf(stderr, "Spawned, monitoring pid is %d.\n", grandchild);
    exit(0);
  }

  /* Parent.  Wait for the child to terminate, then return. */
  wresult = waitpid(child, &status, 0);
  if (wresult < 0) {
    perror("waitpid");
    exit(1);
  }

  if (WIFSIGNALED(status)) {
    fprintf(stderr, "child caught signal %d unexpectedly.\n", WTERMSIG(status));
    exit(1);
  }

  if (!WIFEXITED(status)) {
    /* Neither exited nor signaled; not expected from a waitpid with
       no options, but never report it as success. */
    exit(1);
  }

  if (WEXITSTATUS(status) != 0) {
    /* The intermediate child failed, e.g. its own fork did not
       succeed, so no monitor is running. */
    fprintf(stderr, "child exited with status %d.\n", WEXITSTATUS(status));
    exit(1);
  }
}
00392 
void
usage() {
  /* Writes the short command-line synopsis to stderr. */
  static const char synopsis[] =
    "\n"
    "autorestart [opts] program [args . . . ]\n"
    "autorestart -h\n\n";

  fputs(synopsis, stderr);
}
00400 
00401 void
00402 help() {
00403   usage();
00404   fprintf(stderr,
00405           "This program is used to run a program as a background task and\n"
00406           "automatically restart it should it terminate for any reason other\n"
00407           "than normal exit or explicit user kill.\n\n"
00408 
00409           "If the program exits with a status of 0, indicating successful\n"
00410           "completion, it is not restarted.\n\n"
00411 
00412           "If the program is terminated via a TERM or KILL signal (e.g. via\n"
00413           "kill [pid] or kill -9 [pid]), it is assumed the user meant for the\n"
00414           "process to stop, and it is not restarted.\n\n"
00415 
00416           "Options:\n\n"
00417 
00418           "  -l logfilename\n"
00419           "     Route stdout and stderr from the child process into the indicated\n"
00420           "     log file.\n\n"
00421 
00422           "  -p pidfilename\n"
00423           "     Write the pid of the monitoring process to the indicated pidfile.\n\n"
00424 
00425           "  -n\n"
00426           "     Do not attempt to restart the process under any circumstance.\n"
00427           "     The program can still be used to execute a script on abnormal\n"
00428           "     process termination.\n\n"
00429 
00430           "  -t\n"
00431           "     Stop on terminate: don't restart if the child process exits\n"
00432           "     normally or is killed with a SIGTERM.  With this flag, the\n"
00433           "     child process will be restarted only if it exits with a\n"
00434           "     non-zero exit status, or if it is killed with a signal other\n"
00435           "     than SIGTERM.  Without this flag, the default behavior is to\n"
00436           "     restarted the child process if it exits for any reason.\n\n"
00437 
00438           "  -r count,secs,sleep\n"
00439           "     Sleep 'sleep' seconds if the process respawns 'count' times\n"
00440           "     within 'secs' seconds.  This is designed to prevent respawning\n"
00441           "     from using too many system resources if something is wrong with\n"
00442           "     the child process.  The default value is %d,%d,%d. Use -r 0,0,0\n"
00443           "     to disable this feature.\n\n"
00444 
00445           "  -s \"command\"\n"
00446           "     Run the indicated command or script each time the process is\n"
00447           "     respawned, using the system() call.  This may be useful, for\n"
00448           "     instance, to notify an operator via email each time a respawn\n"
00449           "     occurs.  If -c is also specified, an additional parameter will\n"
00450           "     be appended to the command, indicating the number of times the\n"
00451           "     respawn has occurred in the given time interval.\n\n"
00452 
00453           "  -c secs\n"
00454           "     Specifies the number of seconds over which to count respawn events\n"
00455           "     for the purposes of passing an argument to the script named with\n"
00456           "     -s.\n\n"
00457 
00458           "  -d secs\n"
00459           "     Specifies the number of seconds to delay for between restarts.\n"
00460           "     The default is %d.\n\n"
00461 
00462           "  -h\n"
00463           "     Output this help information.\n\n",
00464           spam_respawn_count, spam_respawn_time, spam_restart_delay_time, respawn_delay_time);
00465 }
00466 
/* Converts one field of a triplet to an int.  Exits with status 1 and
   a diagnostic if the field is empty, has trailing non-numeric
   characters, or does not fit in an int. */
static int
parse_int_field(const char *field) {
  char *end;
  long value;

  errno = 0;
  value = strtol(field, &end, 10);
  if (end == field || *end != '\0' || errno == ERANGE ||
      value < INT_MIN || value > INT_MAX) {
    fprintf(stderr, "Invalid integer: %s\n", field);
    exit(1);
  }
  return (int)value;
}

void
parse_int_triplet(char *param, int *a, int *b, int *c) {
  /* Parses a string of the form "A,B,C" into three ints, stored
     through a, b and c.

     param is modified in place: the first two commas are overwritten
     with NUL terminators so that each field can be converted on its
     own.  If either comma is missing, or any field is not a valid
     int, a message is printed to stderr and the process exits with
     status 1. */
  char *comma;
  char *comma2;
  
  comma = strchr(param, ',');
  if (comma == NULL) {
    fprintf(stderr, "Comma required: %s\n", param);
    exit(1);
  }

  comma2 = strchr(comma+1, ',');
  if (comma2 == NULL) {
    fprintf(stderr, "Second comma required: %s\n", param);
    exit(1);
  }

  /* Split the string into three separately terminated fields. */
  *comma = '\0';
  *comma2 = '\0';
  
  *a = parse_int_field(param);
  *b = parse_int_field(comma + 1);
  *c = parse_int_field(comma2 + 1);
}
00491 
00492 int 
00493 main(int argc, char *argv[]) {
00494   extern char *optarg;
00495   extern int optind;
00496   /* The initial '+' instructs GNU getopt not to reorder switches. */
00497   static const char *optflags = "+l:p:ntr:s:c:d:wh";
00498   int flag;
00499 
00500   flag = getopt(argc, argv, optflags);
00501   while (flag != EOF) {
00502     switch (flag) {
00503     case 'l':
00504       logfile_name = optarg;
00505       break;
00506 
00507     case 'p':
00508       pidfile_name = optarg;
00509       break;
00510 
00511     case 'n':
00512       stop_always = 1;
00513       break;
00514 
00515     case 't':
00516       stop_on_terminate = 1;
00517       break;
00518 
00519     case 'r':
00520       parse_int_triplet(optarg, &spam_respawn_count, &spam_respawn_time, &spam_restart_delay_time);
00521       break;
00522 
00523     case 'w':
00524       spam_restart_delay_time = atoi(optarg);
00525       break;
00526 
00527     case 's':
00528       respawn_script = optarg;
00529       break;
00530 
00531     case 'c':
00532       respawn_count_time = atoi(optarg);
00533       break;
00534 
00535     case 'd':
00536       respawn_delay_time = atoi(optarg);
00537       break;
00538 
00539     case 'h':
00540       help();
00541       return 1;
00542 
00543     case '?':
00544     case '+':
00545       usage();
00546       return 1;
00547 
00548     default:
00549       fprintf(stderr, "Unhandled switch: -%c\n", flag);
00550       return 1;
00551     }
00552     flag = getopt(argc, argv, optflags);
00553   }
00554 
00555   argc -= (optind - 1);
00556   argv += (optind - 1);
00557 
00558   if (argc < 2) {
00559     fprintf(stderr, "No program to execute given.\n");
00560     usage();
00561     return 1;
00562   }
00563 
00564   params = &argv[1];
00565 
00566   if (logfile_name != NULL) {
00567     logfile_fd = open(logfile_name, O_WRONLY | O_CREAT | O_TRUNC, 0666);
00568     if (logfile_fd < 0) {
00569       fprintf(stderr, "Cannot write to logfile %s: %s\n", 
00570               logfile_name, strerror(errno));
00571       return 1;
00572     }
00573     fprintf(stderr, "Generating output to %s.\n", logfile_name);
00574   }
00575 
00576   double_fork();
00577 
00578   return 0;
00579 }
00580 
 All Classes Functions Variables Enumerations