3 # jobd - perform Girocco maintenance jobs
5 # Run with --help for details
13 eval 'require Pod::Text::Termcap; 1;' and
14 @Pod::Usage::ISA = (qw( Pod::Text::Termcap ));
15 defined($ENV{PERLDOC}) && $ENV{PERLDOC} ne "" or
16 $ENV{PERLDOC} = "-oterm -oman";
18 use POSIX ":sys_wait_h";
21 use lib "__BASEDIR__";
26 BEGIN {noFatalsToBrowser}
27 use Girocco::ExecUtil;
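# _defjobs: pick a default --max-parallel value from the detected CPU count
# (2*cpus for 1-2 cpus, 5 for 3 cpus, 1.5*cpus up to 10 cpus, capped above
# that -- see the --max-parallel POD entry below); falls back to 4 when the
# CPU count cannot be determined.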
31 return 4 unless defined($cpus) && $cpus ne "" && int($cpus) >= 1;
32 return int($cpus * 2) if $cpus <= 2;
33 return 5 if $cpus < 4;
34 return int($cpus * 1.5) if $cpus <= 10;
41 my $cpus = online_cpus;
43 my $max_par = $cpus ? _defjobs($cpus) : 4;
44 my $max_par_intensive = 1;
45 my $load_triggers = $cpus ? sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
46 my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
47 my $restart_delay = 300;
51 my ($update_only, $gc_only, $needs_gc_only);
53 my ($load_trig, $load_untrig);
59 my $p = $job->{'project'};
60 check_project_exists($job) || return;
61 my $projpath = get_project_path($p);
62 if ($gc_only || $needs_gc_only ||
63 -e "$projpath/.nofetch" ||
64 -e "$projpath/.bypass" ||
65 -e "$projpath/.bypass_fetch" ||
66 is_mirror_disabled($p)) {
68 setup_gc($job) unless ! -e "$projpath/.nofetch" &&
69 -e "$projpath/.clone_in_progress" && ! -e "$projpath/.clone_failed";
72 if (-e "$projpath/.clone_in_progress" && ! -e "$projpath/.clone_failed") {
73 job_skip($job, "initial mirroring not complete yet");
76 if (-e "$projpath/.clone_failed" || -e "$projpath/.clone_failed_exceeds_limit") {
77 job_skip($job, "initial mirroring failed");
78 # Still need to gc clones even if they've failed
82 if (my $ts = is_operation_uptodate($p, 'lastrefresh', rand_adjust($Girocco::Config::min_mirror_interval))) {
83 job_skip($job, "not needed right now, last run at $ts");
87 if (is_svn_clone($p)) {
88 # git svn can be very, very slow at times
89 $job->{'timeout_factor'} = 3;
91 exec_job_command($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
96 my $p = $job->{'project'};
97 check_project_exists($job) || return;
98 my $projpath = get_project_path($p);
99 if ($update_only || -e "$projpath/.nogc" || -e "$projpath/.bypass" ||
100 (-e "$projpath/.delaygc" && ! -e "$projpath/.allowgc" && ! -e "$projpath/.needsgc")) {
105 if (! -e "$projpath/.needsgc" && ($needs_gc_only ||
106 ($ts = is_operation_uptodate($p, 'lastgc', rand_adjust($Girocco::Config::min_gc_interval))))) {
107 job_skip($job, ($needs_gc_only ? undef : "not needed right now, last run at $ts"));
110 # allow garbage collection to run for longer than an update
111 $job->{'lastgc'} = get_git_config($projpath, "gitweb.lastgc");
112 $job->{'timeout_factor'} = 3;
113 exec_job_command($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
119 project => $job->{'project'},
121 command => \&gc_project,
123 on_success => \&maybe_setup_gc_again,
127 sub maybe_setup_gc_again {
129 # If lastgc was set before gc.sh ran successfully but is no longer set
130 # now, then queue up another run of gc.sh for the project.
131 # However, just in case, no matter what happens with the extra
132 # gc.sh run no more "bonus" runs are possible to avoid any loops.
133 # This allows a "mini" gc that triggers a full gc to have the
134 # full gc run as part of the same --all-once run through instead
135 # of waiting. A very good thing for users of the --all-once option.
136 if ($job->{'lastgc'}) {
137 my $projpath = get_project_path($job->{'project'});
138 get_git_config($projpath, "gitweb.lastgc") or
140 project => $job->{'project'},
142 command => \&gc_project,
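# check_project_exists: skip the job and return false when the project's
# repository directory is missing, so callers can bail out early.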
148 sub check_project_exists {
150 my $p = $job->{'project'};
151 if (! -d get_project_path($p)) {
152 job_skip($job, "non-existent project");
158 sub get_project_path {
159 "$Girocco::Config::reporoot/".shift().".git";
162 my $_last_config_path;
166 $_last_config_path = "";
167 $_last_config_id = "";
172 my ($projdir, $name) = @_;
173 defined($projdir) && -d $projdir && -f "$projdir/config" or return undef;
174 my $cf = "$projdir/config";
175 my @stat = stat($cf);
176 @stat && $stat[7] && $stat[9] or return undef;
177 my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
178 if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
179 my $data = read_config_file_hash($cf);
180 defined($data) or $data = {};
181 $_last_config_path = $_last_config_id = "";
182 $_last_config = $data;
183 $_last_config_id = $id;
184 $_last_config_path = $cf;
186 return $_last_config->{$name};
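# is_operation_uptodate: return the project's stored gitweb.$which timestamp
# if the operation ran within the last $threshold seconds, otherwise undef
# (a missing or unparsable timestamp counts as out of date).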
189 sub is_operation_uptodate {
190 my ($project, $which, $threshold) = @_;
191 my $path = get_project_path($project);
192 my $timestamp = get_git_config($path, "gitweb.$which");
193 defined($timestamp) or $timestamp = '';
194 my $unix_ts = parse_rfc2822_date($timestamp) || 0;
195 (time - $unix_ts) <= $threshold ? $timestamp : undef;
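# is_mirror_disabled: mirroring is treated as disabled when gitweb.baseurl
# is empty, contains whitespace, or starts with the word "disabled".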
198 sub is_mirror_disabled {
200 my $path = get_project_path($project);
201 my $baseurl = get_git_config($path, 'gitweb.baseurl');
202 defined($baseurl) or $baseurl = '';
203 $baseurl =~ s/^\s+//;
204 $baseurl =~ s/\s+$//;
205 return $baseurl eq "" || $baseurl =~ /\s/ || $baseurl =~ /^disabled(?:\s|$)/i;
210 my $path = get_project_path($project);
211 my $baseurl = get_git_config($path, 'gitweb.baseurl');
212 defined($baseurl) or $baseurl = '';
213 my $svnurl = get_git_config($path, 'svn-remote.svn.url');
214 defined($svnurl) or $svnurl = '';
215 return $baseurl =~ /^svn[:+]/i && $svnurl;
223 command => \&update_project,
224 on_success => \&setup_gc,
225 on_error => \&setup_gc,
230 queue_one($_) for (Girocco::Project->get_full_list());
233 ######### Daemon operation {{{1
243 # Kills and reaps the specified pid. Returns exit status ($?) on success,
244 # otherwise undef if the process could not be killed or reaped.
245 # First sends SIGINT and, if the process does not exit within 15 seconds, SIGKILL.
246 # We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
247 # advantage of "tee -i" in our update scripts and really anything we're killing
248 # should respond the same to either SIGINT or SIGTERM and exit gracefully.
249 # Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
252 my $use_pg = shift || 0;
253 # Note that the docs for Perl's kill state that a negative signal
254 # number should be used to kill process groups and that while a
255 # negative process id (and positive signal number) may also do that
256 # on some platforms, that's not portable.
257 my $pg = $use_pg ? -1 : 1;
258 my $harsh = time() + 15; # SIGKILL after this delay
259 my $count = kill(2*$pg, $targ); # SIGINT is 2
260 my $reaped = waitpid($targ, WNOHANG);
261 return undef if $reaped < 0;
262 return $? if $reaped == $targ;
263 while ($count && time() < $harsh) {
264 select(undef, undef, undef, 0.2);
265 $reaped = waitpid($targ, WNOHANG);
266 return undef if $reaped < 0;
267 return $? if $reaped == $targ;
270 $count = kill(9*$pg, $targ); # SIGKILL is 9
271 $reaped = waitpid($targ, WNOHANG);
272 return undef if $reaped < 0;
273 return $? if $reaped == $targ;
274 # We should not need to wait to reap a SIGKILL, however, just in case
275 # the system doesn't make a SIGKILL'd process immediately reapable
276 # (perhaps under extremely heavy load) we accommodate a brief delay
277 while ($count && time() < $harsh) {
278 select(undef, undef, undef, 0.2);
279 $reaped = waitpid($targ, WNOHANG);
280 return undef if $reaped < 0;
281 return $? if $reaped == $targ;
286 sub handle_softexit {
287 error("Waiting for outstanding jobs to finish... ".
288 "^C again to exit immediately");
291 $SIG{'INT'} = \&handle_exit;
295 error("Killing outstanding jobs, please be patient...");
296 $SIG{'TERM'} = 'IGNORE';
298 kill_gently($_->{'pid'}, 1);
300 unlink $lockfile if ($locked);
306 $opts{'queued_at'} = time;
307 $opts{'dont_run'} = 0;
308 $opts{'intensive'} = 0 unless exists $opts{'intensive'};
316 $job->{'command'}->($job);
317 if ($job->{'dont_run'}) {
326 "[".$job->{'type'}."::".$job->{'project'}."]";
329 # Only one of those per job!
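# exec_job_command: fork a child that redirects stdin and stdout to /dev/null,
# puts itself into its own process group (so kill_gently can later signal the
# whole group) and runs the job's command; the parent records the child's pid
# and start time in the job.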
330 sub exec_job_command {
331 my ($job, $command, $err_only) = @_;
334 $job->{'finished'} = 0;
335 delete $job->{'pid'};
336 if (!defined($pid = fork)) {
337 error(_job_name($job) ." Can't fork job: $!");
338 $job->{'finished'} = 1;
343 select(undef, undef, undef, 0.1);
345 open STDIN, '<', '/dev/null' || do {
346 error(_job_name($job) ." Can't read from /dev/null: $!");
350 open STDOUT, '>', '/dev/null' || do {
351 error(_job_name($job) ." Can't write to /dev/null: $!");
355 # New process group so we can keep track of all of its children
356 if (!defined(POSIX::setpgid(0, 0))) {
357 error(_job_name($job) ." Can't create process group: $!");
362 # Stop perl from complaining
365 $job->{'pid'} = $pid;
366 $job->{'started_at'} = time;
370 my ($job, $msg) = @_;
371 $job->{'dont_run'} = 1;
372 error(_job_name($job) ." Skipping job: $msg") unless $quiet || !$msg;
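# reap_hanging_jobs: kill (and mark as finished/killed) any running job that
# has exceeded $kill_after seconds, scaled by the job's timeout_factor.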
375 sub reap_hanging_jobs {
377 my $factor = $_->{'timeout_factor'} || 1;
378 if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
379 $_->{'finished'} = 1;
380 my $exitcode = kill_gently($_->{'pid'}, 1);
383 error(_job_name($_) ." KILLED due to timeout" .
384 (($exitcode & 0x7f) == 9 ? " with SIGKILL": ""));
385 push @jobs_killed, _job_name($_);
392 if (!$job->{'finished'}) {
393 $job->{'on_success'}->($job) if defined($job->{'on_success'});
394 $job->{'finished'} = 1;
397 $job->{'on_error'}->($job) if defined($job->{'on_error'});
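# reap_finished_jobs: collect exited child processes, finish each completed
# job via reap_one_job (which fires its on_success or on_error callback) and
# drop it from @running; returns true when anything was reaped so the main
# loop can proceed without sleeping.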
401 sub reap_finished_jobs {
403 my $finished_any = 0;
404 foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
405 delete $child->{'killed'};
406 reap_one_job($child);
410 $pid = waitpid(-1, WNOHANG);
414 my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
416 # any non-zero exit status should trigger on_error
417 $child[0]->{'finished'} = 1 if @child;
420 delete $child[0]->{'pid'};
421 reap_one_job($child[0]);
424 @running = grep { $_->{'finished'} == 0 } @running;
428 sub have_intensive_jobs {
429 grep { $_->{'intensive'} == 1 } @running;
433 "[". scalar(localtime) ."] ";
437 my $loadinfo = undef;
438 if ($^O eq "linux") {
439 # Read /proc/loadavg on Linux
440 open(LOADAV, '<', '/proc/loadavg') or return undef;
444 $loadinfo = 'load average '.join(" ",(split(/\s+/, $info, 4))[0..2]);
446 # Read the output of uptime everywhere else (works on Linux too)
447 open(LOADAV, '-|', 'uptime') or return undef;
448 $loadinfo = <LOADAV>;
451 defined($loadinfo) &&
452 $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
453 return (0.0+$1, 0.0+$2, 0.0+$3);
458 my $last_progress = $start;
459 my $last_checkload = $start - 5;
460 my $current_load = $load_trig;
467 my $s = @queue == 1 ? '' : 's';
468 ferror("--- Processing %d queued job$s", scalar(@queue));
470 $SIG{'INT'} = \&handle_softexit;
471 $SIG{'TERM'} = \&handle_exit;
472 while (@queue || @running) {
474 my $proceed_immediately = reap_finished_jobs();
475 # Check current system load
476 if ($load_trig && (time - $last_checkload) >= 5 && defined((my @loadinfo = get_load_info())[0])) {
477 my $current_load = $loadinfo[0];
478 if ($current_load > $load_trig && !$overloaded) {
480 error("PAUSE: system load is at $current_load > $load_trig") if $progress;
481 } elsif ($current_load < $load_untrig && $overloaded) {
483 error("RESUME: system load is at $current_load < $load_untrig") if $progress;
486 $load_info = ', paused (load '. $current_load .')';
488 $load_info = ', load '. $current_load;
490 $last_checkload = time;
493 if ($progress && (time - $last_progress) >= 60) {
494 ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info", scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
498 push @run_status, _job_name($_)." ". (time - $_->{'started_at'}) ."s";
500 error("STATUS: currently running: ". join(', ', @run_status));
502 $last_progress = time;
504 # Back off if we're too busy
505 if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue || $overloaded) {
506 sleep 1 unless $proceed_immediately;
510 run_job(shift(@queue)) if @queue;
513 my $s = $jobs_executed == 1 ? '' : 's';
514 ferror("--- Queue processed in %s. %d job$s executed, %d skipped, %d killed.",
515 human_duration(time - $start), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
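# run_perpetually: create the lockfile, then repeatedly queue and process all
# projects, rereading the lockfile after each pass so that a "restart" request
# written into it can be returned to the caller (which then re-execs jobd);
# sleeps $restart_delay seconds between passes.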
519 sub run_perpetually {
521 die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
523 open LOCK, '>', $lockfile || die "Cannot create lockfile '$lockfile': $!\n";
530 # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
531 chmod 0640, $lockfile;
532 chmod 0644, $lockfile;
533 # check for restart request
534 open LOCK, '<', $lockfile || die "Lock file '$lockfile' has disappeared!\n";
535 my $request = <LOCK>;
537 chomp $request if defined($request);
538 if (defined($request) && $request eq "restart") {
544 sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
551 ######### Helpers {{{1
554 print STDERR ts().shift()."\n";
557 error(sprintf($_[0], @_[1..$#_]));
566 my $reexec = Girocco::ExecUtil->new;
567 my $realpath0 = realpath($0);
569 close(DATA) if fileno(DATA);
571 Getopt::Long::Configure('bundling');
573 my $parse_res = GetOptions(
575 pod2usage(-verbose => 2, -exitval => 0, -input => $realpath0)},
576 'quiet|q' => sub {++$quiet},
577 'progress|P' => sub {++$progress},
578 'kill-after|k=i' => \$kill_after,
579 'max-parallel|p=i' => \$max_par,
580 'max-intensive-parallel|i=i' => \$max_par_intensive,
581 'load-triggers=s' => \$load_triggers,
582 'restart-delay|d=i' => \$restart_delay,
583 'lockfile|l=s' => \$lockfile,
584 'same-pid' => \$same_pid,
585 'all-once|a' => \$all_once,
586 'one|o=s' => sub {$one_once{$_[1]} = 1, push(@one, $_[1])
587 unless exists $one_once{$_[1]}},
588 'update-only' => \$update_only,
589 'gc-only' => \$gc_only,
590 'needs-gc-only' => \$needs_gc_only,
591 ) || pod2usage(-exitval => 2, -input => $realpath0);
592 fatal("Error: can only use one out of --all-once and --one")
593 if $all_once && @one;
594 my $onlycnt = ($update_only?1:0) + ($gc_only?1:0) + ($needs_gc_only?1:0);
595 fatal("Error: can only use one out of --update-only, --gc-only and --needs-gc-only")
597 fatal("Error: --update-only, --gc-only or --needs-gc-only requires --all-once or --one")
598 if $onlycnt && !($all_once || @one);
600 delete $ENV{'show_progress'};
602 $ENV{'show_progress'} = 0 if $quiet > 1;
604 $progress = 1 unless $progress;
605 $ENV{'show_progress'} = $progress;
608 $load_triggers = '0,0' unless defined((get_load_info())[0]);
609 ($load_trig, $load_untrig) = split(/,/, $load_triggers);
612 queue_one($_) foreach @one;
624 if (run_perpetually() eq "restart") {
625 error("Restarting in response to restart request... ");
626 $reexec->reexec($same_pid);
627 error("Continuing after failed restart: $!");
633 ########## Documentation {{{1
639 jobd.pl - Perform Girocco maintenance jobs
646 -h | --help detailed instructions
647 -q | --quiet run quietly
648 -P | --progress show occasional status updates
649 -k SECONDS | --kill-after SECONDS how long to wait before killing jobs
650 -p NUM | --max-parallel NUM how many jobs to run at the same time
651 -i NUM | --max-intensive-parallel NUM how many resource-hungry jobs to run
653 --load-triggers TRIG,UNTRIG stop queueing jobs at load above
654 TRIG and resume at load below UNTRIG
655 -d NUM | --restart-delay SECONDS wait for this many seconds between
657 -l FILE | --lockfile FILE create a lockfile in the given
659 --same-pid keep same pid during graceful restart
660 -a | --all-once process the list only once
661 -o PRJNAME | --one PRJNAME process only one project
662 --update-only process mirror updates only
663 --gc-only perform needed garbage collection only
664 --needs-gc-only perform needed mini gc only
672 Print the full description of jobd.pl's options.
676 Suppress non-error messages, e.g. for use when running this task as a cronjob.
677 When given two or more times suppress update ref change lines in logs as well.
681 Show information about the current status of the job queue occasionally. This
682 is automatically enabled if --quiet is not given. When specified two or more
683 times full ref change details will be shown for updates.
685 =item B<--kill-after SECONDS>
687 Kill supervised jobs after a certain time to avoid hanging the daemon.
689 =item B<--max-parallel NUM>
691 Run no more than that many jobs at the same time. The default is the number
692 of cpus * 2 for 1 or 2 cpus, 5 for 3 cpus and int(cpus * 1.5) for 4 cpus or
693 more with the default capped to 16 when more than 10 cpus are detected.
694 If the number of cpus cannot be determined, the default is 4.
696 =item B<--max-intensive-parallel NUM>
698 Run no more than that many resource-hungry jobs at the same time. Right now,
699 this refers to repacking jobs. The default is 1.
701 =item B<--load-triggers TRIG,UNTRIG>
703 If the first system load average (1 minute average) exceeds TRIG, don't queue
704 any more jobs until it goes below UNTRIG. This is currently only supported on
705 Linux and any other platforms that provide an uptime command with load average
708 If both values are zero, load checks are disabled. The default is the number
709 of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
710 be determined, the default is 6,3.
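For example, to stop queueing new jobs whenever the 1-minute load average
rises above 10 and resume once it drops below 5 (threshold values chosen
purely for illustration):

    jobd.pl --load-triggers 10,5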
712 =item B<--restart-delay NUM>
714 After processing the queue, wait this many seconds until the queue is
715 restarted. The default is 300 seconds.
717 =item B<--lockfile FILE>
719 For perpetual operation, specify the full path to a lock file to create and
720 then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
721 where $suffix is a 6-character string uniquely determined by the name and
722 nickname of this Girocco instance. The pid of the running jobd instance will
723 be written to the lock file.
727 When performing a graceful restart, keep the same pid rather than switching to
732 Instead of perpetually processing all projects over and over again, process
733 them just once and then exit.
734 Conflicts with B<--one PRJNAME> option.
736 =item B<--one PRJNAME>
738 Process only the given project (given as just the project name without C<.git>
739 suffix) and then exit. May be repeated to process more than one project.
740 Conflicts with B<--all-once> option.
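For example, to process just two projects once and then exit (the project
names here are only placeholders):

    jobd.pl --one project1 --one project2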
742 =item B<--update-only>
744 Limit processing to only those projects that need a mirror update.
745 Behaves as though every project has a C<.nogc> file present in it.
746 Requires use of B<--all-once> or B<--one PRJNAME> option.
747 Conflicts with B<--gc-only> and B<--needs-gc-only> options.
751 Limit processing to only those projects that need to have garbage collection
752 run on them. Behaves as though every project has a C<.bypass_fetch> file
753 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
754 Conflicts with B<--update-only> and B<--needs-gc-only> options.
756 =item B<--needs-gc-only>
758 Limit processing to only those projects that need to have mini garbage
759 collection run on them. Behaves as though every project with a C<.needsgc>
760 file present in it also has a C<.bypass_fetch> file present in it and as though
761 every project without a C<.needsgc> file present in it has a C<.bypass> file
762 present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
763 Conflicts with B<--update-only> and B<--gc-only> options.
769 jobd.pl is Girocco's repositories maintenance servant; it periodically checks
770 all the repositories and updates mirrored repositories and repacks push-mode
771 repositories when needed.