# ------------------------------------------------------------ # Copyright (c) 2019-2023 # Q.W.Page Associates Inc. # www.qwpage.com # All rights reserved. # ------------------------------------------------------------ ::namespace eval ::qw::process_file {} ::proc ::qw::process_file::process_kill {sargs} { /* { Kills a process. Does not attempt to remove process record. Bug if the process is not running. Bug on Failure to kill the process. */ } ::set rwb1_debug 0; ::set ProcessId [::sargs::integer_get $sargs .process_id]; ::set Caller [::sargs::get $sargs .caller]; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.0,ProcessId==$ProcessId,caller==[::sargs::get $sargs .caller]";} ::if {$ProcessId==0} { ::qw::bug 314120240830104630 "[::qw::procname] - invalid process id \"$ProcessId\". caller==$Caller"; } ::qw::try { ::if {![::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname]]} { ::switch -- $::tcl_platform(platform) { "unix" { ::set Data [::exec ps -e | grep "nv2_"]; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.1,ProcessId==$ProcessId,ps==\n$Data";} } } ::qw::warning 314120240911103933 "[::qw::procname] - processId $ProcessId is not running before being killed."; # ::qw::bug 314120240911103933 "[::qw::procname] - processId $ProcessId is not running before being killed."; } ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.2,ProcessId==$ProcessId";} ::switch -- $::tcl_platform(platform) { "windows" { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.3";} ::twapi::end_process $ProcessId -force -wait 1000; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.4";} } "unix" { ::if {0} { ::package require Tclx; ::kill $ProcessId; } ::if {1} { ::exec kill $ProcessId; # ::qw::sleep 100; } } } ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.5,ProcessId==$ProcessId";} } catch Exception { ::qw::warning 314120240830104631 "[::qw::procname] - could not kill process. process_id==\"$ProcessId\",caller==\"$Caller\",Exception==$Exception"; ::return; } ::if {[::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname]]} { ::switch -- $::tcl_platform(platform) { "unix" { ::set Data [::exec ps -e | grep "nv2_"]; ::foreach Line [::split $Data "\n"] { ::if {[::lindex $Line 0] eq $ProcessId} { ::if {[::lindex $Line 1] eq "?"} { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.6,ProcessId==$ProcessId,ps==\n$Data";} ::qw::warning 314120240911103934 "[::qw::procname] - processId $ProcessId is running after kill attempt. Line==$Line"; ::return; } } } ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.7,ProcessId==$ProcessId,ps==\n$Data";} } } # ::qw::bug 314120240911103934 "[::qw::procname] - processId $ProcessId is running after kill attempt."; } ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,1000.8,ProcessId==$ProcessId";} } ::proc ::qw::process_file::process_id_is_running {sargs} { /* { Usage: ::set IsRunning [::qw::process_file::process_id_is_running .process_id $ProcessId $ProcessRecords .caller [::qw::methodname]]; */ } ::set rwb1_debug 0; ::if {$::qw::verbose(process_file)} { ::set rwb1_debug 2; } ::set ProcessId [::sargs::get $sargs .process_id]; #::if {$rwb1_debug} {::puts "rwb1_debug,process_id_is_running,1000.1,process_id==$ProcessId";} ::if {$ProcessId eq ""} { ::qw::bug "314120181122161735" "[::qw::procname] - invalid process id \"$ProcessId\"."; } ::if {$ProcessId<0} { ::qw::bug "314120181122161736" "[::qw::procname] - invalid process id \"$ProcessId\"."; } ::switch -- $::tcl_platform(platform) { "windows" { ::set Exists [::twapi::process_exists $ProcessId]; ::if {!$Exists} { # ------------------------------------------------------------ # Windows says process id does not exist so report not running. # ------------------------------------------------------------ ::return 0; } ::return 1; } "unix" { /* { */ } ::set PsFileData [::sargs::get $sargs .ps_file_data]; ::if {[::sargs::size $PsFileData] == 0} { ::set PsFileData [::exec ps -e | grep "nv2_"]; } ::set LineList [::split $PsFileData "\n"] ::foreach Line $LineList { ::if {[::lindex $Line 0] eq $ProcessId} { ::if {[::string first "" $Line]>=0} { ::if {$rwb1_debug} {::puts "rwb1_debug,::qw::process_file::process_id_is_running,7000.0,ProcessId==$ProcessId,running==0";} ::return 0; } ::return 1; } } ::return 0; } } ::qw::bug 314120181122112209 "[::namespace current]::[::qw::procname] - invalid platform \"$::tcl_platform(platform)\"."; } ::proc ::qw::process_file::process_name_is_running {sargs} { /* { Usage: ::set IsRunning [::qw::process_file::process_name_is_running .process_name $ProcessName .process_records $ProcessRecords]; */ } ::set rwb1_debug 0; ::if {$::qw::verbose(process_file)} { ::set rwb1_debug 2; } ::set ProcessName [::sargs::get $sargs .process_name]; #::if {$rwb1_debug} {::puts "rwb1_debug,process_name_is_running,1000.1,process_id==$ProcessId";} ::if {$ProcessName eq ""} { ::qw::bug "314120181122161735" "[::qw::procname] - invalid process name \"$ProcessName\"."; } ::switch -- $::tcl_platform(platform) { "windows" { ::qw::throw "314120240927085042 - bot implemented for windows yet."; ::set Exists [::twapi::process_exists $ProcessId]; ::if {!$Exists} { # ------------------------------------------------------------ # Windows says process id does not exist so report not running. # ------------------------------------------------------------ ::return 0; } ::return 1; ::set NameOfExecutable [::string tolower [::sargs::get $ProcessRecord .nameofexecutable]]; ::set ExecutableName [::string tolower [::file tail $NameOfExecutable]]; ::set ProcessName [::string tolower [::twapi::get_process_name $ProcessId]]; ::if {$ProcessName ne $ExecutableName} { # ------------------------------------------------------------ # Process id likely re-used by different application; report not running. # ------------------------------------------------------------ ::sargs::file::unset $::qw::control(process_list_file) .$ProcessId; ::if {$rwb1_debug} {::puts "rwb1_debug,4545.1,process_id_name_running==0,process record==\n[::sargs::format $ProcessRecord]";} ::return 0; } ::return 1; } "unix" { /* { */ } ::set Data [::exec ps -e | grep "nv2_"]; ::set ProcessName [::file rootname [::file tail $ProcessName]]; ::if {[::string first $ProcessName $Data]<0} { ::return 0; } ::foreach Line [::split $Data "\n"] { ::if {[::lindex $Line 3] eq $ProcessName} { ::if {[::string first "" $Line]>=0} { ::continue } ::return 1; } } ::return 0; } } ::qw::bug 314120181122112209 "[::namespace current]::[::qw::procname] - invalid platform \"$::tcl_platform(platform)\"."; } ::proc ::qw::process_file::process_is_responding {sargs} { /* { Usage: ::set IsResponding [::qw::process_file::process_is_responding .process_id #ProcessId .process_records $Processrecords]; ::qw::control(process_responding_limit_seconds) [::expr 5*60*1000]; If .ping_clock is not updated by this interval, we deem the process to be not responding, i.e. frozen. We ping the process record to update the .ping_clock by 1/2 of the limit. */ } ::set rwb1_debug 0; ::set ProcessId [::sargs::get $sargs .process_id]; ::if {$ProcessId eq ""} { ::qw::bug "314120230421145221" "[::namespace current]::[::qw::procname] - invalid process id \"$ProcessId\"."; } ::if {![::string is integer $ProcessId]} { ::qw::bug "314120241017145807" "[::namespace current]::[::qw::procname] - invalid process id \"$ProcessId\"."; } ::if {$ProcessId<0} { ::qw::bug "314120230421145222" "[::namespace current]::[::qw::procname] - invalid process id \"$ProcessId\"."; } ::if {!$::qw::control(process_file_is_responding_is_enabled)} { /* { This was causing bugs. Eliminating this attempt to detect frozen processes reverts us back to the Jul 2024 functionality that seemed to work. */ } ::return 1; } ::set ProcessRecords [::sargs::get $sargs .process_records]; ::if {![::sargs::exists $sargs .process_records]} { ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::mutex_manager mutex_lock .mutex_name $::qw::control(process_list_file) .lock_caller [::qw::procname]; ::qw::finally [::list ::qw::mutex_manager mutex_unlock .mutex_name $::qw::control(process_list_file) .unlock_caller [::qw::procname]]; } ::set ProcessRecords [::qw::process_file::load]; } ::set ProcessRecord [::sargs::get $ProcessRecords .$ProcessId]; ::set HubPingClock [::sargs::integer_get $ProcessRecord .ping_clock]; ::set IntervalSeconds [::expr [::clock seconds]-$HubPingClock]; ::if {$IntervalSeconds<$::qw::control(process_responding_limit_seconds)} { # ------------------------------------------------------------ # Process is running and responding. # ------------------------------------------------------------ /* { The process has to ping within the responding limit or we consider it non-responding. This number is somewhat arbitrary of course. */ } ::if {$rwb1_debug} { ::if {[::sargs::get $ProcessRecord .app_name] eq "app_name_service_node"} { # ::puts "rwb1_debug,garbage_collect,node is running,[::clock format [::clock seconds] -format %Y%m%d%H%M%S]"; } } ::return 1; } # ------------------------------------------------------------ # Process is running but not responding. # ------------------------------------------------------------ ::if {$rwb1_debug} { ::puts "rwb1_debug,garbage_collect,process not responding,ProcessId==$ProcessId"; ::puts "rwb1_debug,garbage_collect,process not responding,caller==[::sargs::get $sargs .caller]"; ::puts "rwb1_debug,garbage_collect,process not responding::qw::control(process_ping_interval_seconds)==$::qw::control(process_ping_interval_seconds)"; ::puts "rwb1_debug,garbage_collect,process not responding,clock==[::clock format [::clock seconds] -format %Y%m%d%H%M%S]"; ::puts "rwb1_debug,garbage_collect,process not responding,IntervalSeconds==$IntervalSeconds,::qw::control(process_responding_limit_seconds)==$::qw::control(process_responding_limit_seconds)"; ::puts "rwb1_debug,garbage_collect,process not responding,HubPingClock==$HubPingClock,IntervalSeconds==$IntervalSeconds"; ::puts "rwb1_debug,garbage_collect,process not responding,process_record==\n[::sargs::format $ProcessRecord]"; } ::return 0; } ::proc ::qw::process_file::garbage_collect {sargs} { /* { Uses the process_list.txt file to check all processes. (1) Deletes any process_record whose process is not running. (2) Kills process if running but not responding and deletes process_record. (3) Kills any node whose hub is not running. (2) Kills any running hub process if its hub record is inactive. (3) Runs any non-running hub if its process record is active. Deletes and record that has no process or a process that is not responding,killing that process. Returns the remaining process list as a convenience. Also kills all service_nodes if their service_hub is not running or not responding. */ } ::set rwb1_debug 0; ::if {$::qw::verbose(process_file)} { ::set rwb1_debug 2; } ::set ProcessRecords [::sargs::get $sargs .process_records]; ::if {![::sargs::exists $sargs .process_records]} { ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::mutex_manager mutex_lock .mutex_name $::qw::control(process_list_file) .lock_caller [::qw::procname]; ::qw::finally [::list ::qw::mutex_manager mutex_unlock .mutex_name $::qw::control(process_list_file) .unlock_caller [::qw::procname]]; } ::set ProcessRecords [::qw::process_file::load]; } # ::set ProcessRecords [::sargs::file::get $::qw::control(process_list_file)]; ::switch -- $::tcl_platform(platform) { "windows" { ::set LineList ""; } "unix" { ::set LineList [::split [::exec ps -e | grep "nv2_"] "\n"]; } } ::switch -- $::tcl_platform(platform) { "windows" { ::set PsFileData ""; } "unix" { ::set PsFileData [::exec ps -e | grep "nv2_"]; } } ::foreach {Name ProcessRecord} $ProcessRecords { # ------------------------------------------------------------ # Delete process_file records for inactive processes. # ------------------------------------------------------------ ::if {$rwb1_debug} { ::set AppName [::sargs::get $ProcessRecord .app_name]; } ::if {![::sargs::exists $ProcessRecord .process_id]} { /* { Encountered a bad process record during debugging. This shouldn't happen, but when it does, bugs will occur later. So get rid of the bad record now. */ } ::qw::warning "314220230511120307" "[::qw::procname] - bad process record,name==$Name,ProcessRecord==\n[::sargs::format $ProcessRecord]"; ::if {!$rwb1_debug} { ::sargs::var::unset ProcessRecords $Name; ::sargs::file::unset $::qw::control(process_list_file) $Name; } ::if {$rwb1_debug} { ::puts "rwb1_debug,314220230511120307,$ProcessRecords==\n$ProcessRecords"; ::sargs::var::unset ProcessRecords $Name; ::sargs::file::unset $::qw::control(process_list_file) $Name; ::set ProcessRecords1 [::qw::process_file::load .caller [::qw::procname]]; ::puts "rwb1_debug,314220230511120308,$ProcessRecords1==\n$ProcessRecords1"; ::if {[::sargs::exists $ProcessRecords1 $Name]} { ::qw::bug 314220230511120309 "Could not unset $Name,$ProcessRecords1==\n$ProcessRecords1"; } } ::continue; } ::set ProcessId [::sargs::integer_get $ProcessRecord .process_id]; ::if {![::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname] .ps_file_data $PsFileData]} { # ------------------------------------------------------------ # There is a process record but it's not running. Delete the record. # ------------------------------------------------------------ ::if {$rwb1_debug} { ::if {$AppName ne "app_name_service_stub_checker"} { ::puts "rwb1_debug,::qw::process_file::garbage_collect,unsetting process_file record because process not running,process record==\n[::sargs::format $ProcessRecord]"; } } ::sargs::var::unset ProcessRecords $Name; ::sargs::file::unset $::qw::control(process_list_file) $Name; ::continue; } ::if {![::qw::process_file::process_is_responding .process_id $ProcessId .process_records $ProcessRecords .caller [::qw::procname]]} { # ------------------------------------------------------------ # Process is running but frozen. Kill it and delete the process record. # ------------------------------------------------------------ ::if {$rwb1_debug} { ::if {$AppName ne "app_name_service_stub_checker"} { ::puts "rwb1_debug,::qw::process_file::garbage_collect,killing process because not responding,process record==\n[::sargs::format $ProcessRecord]"; } } ::qw::process_file::process_kill .process_id $ProcessId .caller [::qw::procname]; ::if {$rwb1_debug} {::puts "rwb1_debug,::qw::process_file::garbage_collect,1000.10,killing process,process_file record==\n[::sargs::format $ProcessRecord]";} ::sargs::var::unset ProcessRecords $Name; ::sargs::file::unset $::qw::control(process_list_file) $Name; ::continue; } } # ------------------------------------------------------------ # Get the list of process record paths for the service_hubs. # ------------------------------------------------------------ ::set ServiceHubPathList [::sargs::select_field_value .structure $ProcessRecords .field .app_name .value "app_name_service_hub"]; # ------------------------------------------------------------ # Kill any nodes who's hub is not running. # ------------------------------------------------------------ ::foreach {Name ProcessRecord} $ProcessRecords { /* { During development we managed to leave some orphan service_nodes running. An orphan service_node is running but it's service_hub isn't. Here we kill any service nodes whose service hub is not running. Note that all processes are now deemed running because we already removed inactive processes. */ } ::if {[::sargs::get $ProcessRecord .app_name] ne "app_name_service_node"} { # We only want service nodes. ::continue; } ::set NodeHubProcessId [::sargs::integer_get $ProcessRecord .service_hub_process_id]; ::set NodeHasServiceHub 0; ::foreach ServiceHubPath $ServiceHubPathList { /* { Search for the node's hub process id in the list of active hub paths. */ } ::if {[::sargs::integer_get $ProcessRecords $ServiceHubPath.process_id] eq $NodeHubProcessId} { /* { Found the node's active service hub. Node is not an orphan so nothing to do. */ } ::set NodeHasServiceHub 1; ::break; } } ::if {$NodeHasServiceHub==0} { /* { Service node has no hub so it's an orphan and we kill orphans. Tclx provides the ::kill command and it was tested on windows and linux. */ } ::set ServiceNodeProcessId [::sargs::integer_get $ProcessRecord .process_id]; ::qw::try { ::if {$rwb1_debug} {::puts "rwb1_debug,::qw::process_file::garbage_collect,killing node process because node's hub not running,process record==\n[::sargs::format $ProcessRecord]";} ::qw::process_file::process_kill .process_id $ServiceNodeProcessId .caller [::qw::procname]; } catch Exception { ::qw::warning 314120190527110442 "Could not kill service_node, exception==$Exception"; } ::if {$rwb1_debug} { ::if {[::qw::process_file::process_id_is_running .process_id $ServiceNodeProcessId .caller [::qw::procname]]} { ::puts "314120240920160420,data==\n[::exec ps -e | grep nv2_]"; ::qw::bug 314120240920160420 "[::qw::procname] - couldn't kill process id \"$ServiceNodeProcessId\"."; } } ::sargs::var::unset ProcessRecords $Name; ::sargs::file::unset $::qw::control(process_list_file) $Name; } } } ::proc ::qw::process_file::kill_all {sargs} { /* { Usage: kill_all; If the list is empty, kills everything in the process_file. */ } ::set rwb1_debug 0; ::if {$::qw::verbose(process_file)} { ::set rwb1_debug 2; } ::qw::profile::finally "::qw::process_file::kill_all"; ::set PID [::pid]; ::package require Tclx; /* { We don't lock the process_file. We are killing all of the processes in it and one of them might be frozen with the file locked. That would freeze this process too if we let it. */ } ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::mutex_manager mutex_lock .mutex_name $::qw::control(process_list_file) .lock_caller [::qw::procname]; ::qw::finally [::list ::qw::mutex_manager mutex_unlock .mutex_name $::qw::control(process_list_file) .unlock_caller [::qw::procname]]; } ::set ProcessRecords [::qw::process_file::load .caller [::qw::procname]]; # ::set ProcessRecords [::sargs::file::get $::qw::control(process_list_file)]; ::if {$rwb1_debug} { ::puts "rwb1_debug,kill_all_all,1000.0,1,ps file==\n[::exec ps -e | grep nv2_]"; ::puts "rwb1_debug,kill_all_all,1000.0,2,ProcessRecords==\n$ProcessRecords"; } ::set ProcessIdList [::list]; ::if {0} { ::set ProcessIdList [::list]; ::foreach {Name ProcessRecord} $ProcessRecords { ::set ProcessId [::sargs::get $ProcessRecord .process_id] ::if {$ProcessId ne $PID} { ::lappend ProcessIdList $ProcessId; } } ::if {$rwb1_debug} { ::puts "rwb1_debug,kill_all_all,1000.1.0,length==[::llength $ProcessIdList]"; ::puts "rwb1_debug,kill_all_all,1000.1.1,ProcessIdList==$ProcessIdList"; } ::if {[::llength $ProcessIdList]>0} { ::eval ::exec kill $ProcessIdList; ::file delete -force $::qw::control(process_list_file); } ::set ProcessIdList [::list]; ::foreach Line [::split [::exec ps -e | grep "nv2_"] "\n"] { ::set ProcessId [::lindex $Line 0]; ::if {$ProcessId ne $PID} { ::lappend ProcessIdList $ProcessId; } } ::if {$rwb1_debug} { ::puts "rwb1_debug,kill_all_all,1000.2.0,length==[::llength $ProcessIdList]"; ::puts "rwb1_debug,kill_all_all,1000.2.1,ProcessIdList==$ProcessIdList"; } ::if {[::llength $ProcessIdList]>0} { ::eval ::exec kill $ProcessIdList; } ::if {$rwb1_debug} {::puts "rwb1_debug,314120241001111036,kill_all,1000.99";} ::return; } ::set ProgressLimit [::expr {[::llength $ProcessRecords]/2}]; ::set ProgressMinimum 10; ::set Progress ""; ::if {$ProgressLimit>=$ProgressMinimum} { /* { This operation could take a while, i.e. the linux_stress_test has 60 processes running. Using a red progress updates the process_list file and ensures that a checker does not kill this process. */ } ::set Progress [::QW::PROGRESS::OPERATION [::qw::progress::auto] \ .database_id "" \ .file "" \ .limit $ProgressLimit \ .operation "Killing NewViews processes" \ .status "Killing NewViews processes." \ .destroy_skip 1 \ ]; ::qw::finally [::list ::itcl::delete object $Progress]; } ::set ProgressAccumulator 0; # ------------------------------------------------------------ # kill all processes of each listed app_name, in order # ------------------------------------------------------------ ::if {0} { ::foreach {Name ProcessRecord} $ProcessRecords { ::set ProcessId [::sargs::get $ProcessRecord .process_id]; ::if {$ProcessId eq $PID} { # ------------------------------------------------------------ # Suicide is avoided. If you want to die do it yourself. # ------------------------------------------------------------ ::continue; } ::if {$Progress ne ""} { ::incr ProgressAccumulator 1; ::if {$ProgressAccumulator>=1} { $Progress increment $ProgressAccumulator; ::set ProgressAccumulator 0; } } ::if {[::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname]]} { ::qw::process_file::process_kill .process_id $ProcessId .caller [::qw::procname]; ::sargs::file::unset $::qw::control(process_list_file) $Name; ::sargs::var::unset ProcessRecords $Name; } ::if {$rwb1_debug} { ::if {[::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname]]} { ::qw::bug 314120240909111830 "Couldn't kill process id \"$ProcessId\". ProcessRecords==\n$ProcessRecords"; } } } ::if {$Progress ne ""} { ::if {$ProgressAccumulator>0} { $Progress increment $ProgressAccumulator; ::set ProgressAccumulator 0; } } ::return; } ::foreach AppName { app_name_service_stub_checker app_name_service_node app_name_service_hub app_name_service_stub } { ::foreach {Name ProcessRecord} $ProcessRecords { ::set ProcessId [::sargs::get $ProcessRecord .process_id]; ::if {$ProcessId eq $PID} { # ------------------------------------------------------------ # Suicide is avoided. If you want to die do it yourself. # ------------------------------------------------------------ ::continue; } ::if {$AppName eq [::sargs::get $ProcessRecord .app_name]} { ::if {$Progress ne ""} { ::incr ProgressAccumulator 1; ::if {$ProgressAccumulator>=1} { $Progress increment $ProgressAccumulator; ::set ProgressAccumulator 0; } } ::if {[::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname]]} { ::qw::process_file::process_kill .process_id $ProcessId .caller [::qw::procname]; ::sargs::file::unset $::qw::control(process_list_file) $Name; ::sargs::var::unset ProcessRecords $Name; } ::if {$rwb1_debug} { ::if {[::qw::process_file::process_id_is_running .process_id $ProcessId .caller [::qw::procname]]} { ::qw::bug 314120240909111830 "Couldn't kill process id \"$ProcessId\". ProcessRecords==\n$ProcessRecords"; } } } } } ::if {$rwb1_debug} { ::set Data [::exec ps -e | grep "nv2_"]; ::set Lines [::split $Data "\n"]; ::puts "rwb1_debug,kill_all_all,3000.11,after,length==[::llength $Lines],lines==\n$Data"; } ::if {$Progress ne ""} { ::if {$ProgressAccumulator>0} { $Progress increment $ProgressAccumulator; ::set ProgressAccumulator 0; } } ::switch -- $::tcl_platform(platform) { "windows" { } "unix" { /* { Sometimes qw leave processes dangling. They are running (or are zombies) but they have no record in process_file.txt. We therefore get all processes from LineList and kill any stubs, hubs, or nodes, as identified by their executable name. */ } ::set LineList [::split [::exec ps -e | grep "nv2_"] "\n"]; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.8,Count==[::llength $LineList]";} ::set ProcessIdList [::list]; ::if {0} { ::foreach Line $LineList { ::set ProcessId [::lindex $Line 0]; ::if {$ProcessId eq [::pid]} { # No suicides. ::continue; } ::lappend ProcessIdList $ProcessId; } ::eval ::exec kill $ProcessIdList; ::return; } ::foreach Line $LineList { ::set ProcessId [::lindex $Line 0]; ::if {$ProcessId eq [::pid]} { # No suicides. ::continue; } ::if {![::string is integer $ProcessId]} { ::continue; } ::set ExecutableName [::lindex $Line 3]; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.11,ExecutableName==$ExecutableName";} ::switch -glob -- $ExecutableName { "nv2" { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.11.0,ExecutableName==$ExecutableName";} } "nv2_stub*" { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.11.1,ExecutableName==$ExecutableName";} } "nv2_hub*" { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.11.2,ExecutableName==$ExecutableName";} } "nv2_node*" { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.11.3,ExecutableName==$ExecutableName";} } default { ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.12,ExecutableName==$ExecutableName,Line==$Line";} ::continue; } } ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.13,Line==$Line";} ::qw::process_file::process_kill .process_id $ProcessId .caller [::qw::procname]; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.14,Line==$Line";} } /* { Lat's see if the processes are really killed. */ } ::set LineList [::split [::exec ps -e | grep "nv2_"] "\n"]; ::if {$rwb1_debug} {::puts "rwb1_debug,process_kill,6000.15,Count==[::llength $LineList]";} ::foreach Line $LineList { ::set ProcessId [::lindex $Line 0]; ::if {$ProcessId eq [::pid]} { ::continue; } ::qw::warning 314120240916102716 "kill_all - Could not kill process. ProcessId==$ProcessId,Line==$Line."; } } } } ::proc ::qw::process_file::ping_clock_background {} { /* { The stub can find out if a hub is running and if not, restart it. But a running hub could be frozen, or performing a long duration operation. To detect this the hub updates it's process_record on a regular basis using ping_clock_background. It simply sets, i.e. updates, it's process record's .ping_clock field. The stub can now check this field in the process_record and assume the hub is frozen if the .ping_clock field hasn't been updated for a while, say a minute. The red process window also updates the process record .ping_clock whenever it updates itself, say once per second, so we can also cover the case where the hub is performing a long duration operation. Note that a hub is unlikely to do any such thing, as most long durection operationd sre performed in nodes. But this technique will also allow the hub to detect whether any of its nodes are frozen but not busy. Note: we better also have the blue progress update the process record. */ } ::set rwb1_debug 0; ::set Seconds [::clock seconds]; ::set MyPID [::pid]; ::set ::qw::process_file::ping_clock_background_id ""; ::if {$rwb1_debug} {::puts "rwb1_debug,ping_clock_background,1000.0,exe==[::file tail [::info nameofexecutable]],clock==$Seconds";} ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::mutex_manager mutex_lock .mutex_name $::qw::control(process_list_file) .lock_caller [::qw::procname]; ::qw::finally [::list ::qw::mutex_manager mutex_unlock .mutex_name $::qw::control(process_list_file) .unlock_caller [::qw::procname]]; } ::qw::process_file::check .caller [::qw::procname]; ::if {[::sargs::file::exists $::qw::control(process_list_file) .$MyPID]} { /* { We had an incident where the process record was already unset and then the ping background came along and created a process record with the single .ping_clock field. That could only lead to trouble. */ } ::qw::process_file::check .caller [::qw::procname]; ::sargs::file::set $::qw::control(process_list_file) \ .$MyPID.ping_clock $Seconds \ .$MyPID.ping_date [::clock format $Seconds -format %Y%m%d%H%M%S] \ ; ::qw::process_file::check .caller [::qw::procname]; ::if {![::sargs::file::exists $::qw::control(process_list_file) .$MyPID.ping_clock]} { ::set ProcessRecord [::sargs::file::get $::qw::control(process_list_file) .$MyPID]; ::set Handle [::open $::qw::control(process_list_file) r]; ::set ProcessRecords [::read $Handle]; ::close $Handle; ::qw::bug 314120241110191242 "Can't find process record field \".$MyPID.ping_clock\"."; } } # ------------------------------------------------------------ # Set the next ping_clock update. # ------------------------------------------------------------ /* { We have arbitrarily set the trigger for not responding to 30 minutes. If the ping clock is more than 30 minutes since update we kill the process. We update the ping clock 30/10, i.e. every 3 minutes; again this is basically arbitrary. */ } ::set Interval [::expr {$::qw::control(process_ping_interval_seconds)*1000}]; ::qw::process_file::ping_clock_background_id_set \ [::after $Interval { ::if {[::qw::command_exists ::qw::process_file::ping_clock_background]} { ::qw::process_file::ping_clock_background; } }]; } ::set ::qw::process_file::ping_clock_background_id ""; ::proc ::qw::process_file::ping_clock_background_id_set {AfterId} { /* { qw._qw_tcl was trying to set the background_id but the qw_lib was not loaded and caused an error that namespace was not created yet, By using a proc to set it, we automatically load the qw_lib. */ } ::set ::qw::process_file::ping_clock_background_id $AfterId; } ::proc ::qw::process_file::dump_process_records {{Comment ""}} { ::set rwb1_debug 0; ::if {!$rwb1_debug} { ::return; } ::if {$Comment ne ""} { ::puts $Comment; } ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::mutex_manager mutex_lock .mutex_name $::qw::control(process_list_file) .lock_caller [::qw::procname]; ::qw::finally [::list ::qw::mutex_manager mutex_unlock .mutex_name $::qw::control(process_list_file) .unlock_caller [::qw::procname]]; } ::set ProcessRecords [::qw::process_file::load .caller [::qw::procname]]; # ::set ProcessRecords [::sargs::file::get $::qw::control(process_list_file)]; ::puts "rwb1_debug,::qw::process_file::dump_process_records,ProcessRecords==\n[::sargs::format $ProcessRecords]"; ::foreach {Name ProcessRecord} $ProcessRecords { ::puts "rwb1_debug,process_record==\n[::sargs::format [::sargs $Name $ProcessRecord]]"; } } ::proc ::qw::process_file::infinite_loop {} { /* { This is used during testing to simulate process not responding. */ } ::set rwb1_debug 0; ::set Seconds [::clock seconds]; ::while {1} { ::set Seconds1 [::clock seconds]; ::if {$Seconds!=$Seconds} { ::set Seconds $Seconds1; ::puts "::qw::process_file::infinite_loop - seconds==$Seconds"; } } } ::proc ::qw::process_file::check {sargs} { ::set rwb1_debug 0; ::if {$rwb1_debug} { ::set Milli [::clock clicks -milliseconds]; } ::set Caller [::sargs::get $sargs .caller]; ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::bug 314120241113155912 "[::qw::method_name] - accessing process_list_file without a mutex."; } ::set ProcessRecords [::sargs::file::get $::qw::control(process_list_file)]; ::if {$rwb1_debug} { ::if {[::sargs::size $ProcessRecords]==0} { ::puts "rwb1_debug,314120241101111253,sargs==$sargs"; ::puts "rwb1_debug,314120241101111253,[::qw::procname] - empty file." ::qw::bug 314120241101111253 "[::qw::procname] - empty file."; } } ::if {$rwb1_debug} { ::set FieldList [::list]; ::lappend FieldList .app_name; ::lappend FieldList .release; ::lappend FieldList .process_id; ::lappend FieldList .nameofexecutable; ::lappend FieldList .argv; ::lappend FieldList .commandline; ::lappend FieldList .boot_clock; ::lappend FieldList .boot_date; ::lappend FieldList .ping_clock; ::lappend FieldList .ping_date; # ::lappend FieldList .service_hub_process_id; # ::lappend FieldList .service_hub_node_id; ::foreach {Name ProcessRecord} $ProcessRecords { ::foreach Field $FieldList { ::if {![::sargs::exists $ProcessRecord $Field]} { ::puts "rwb1_debug,314120241101111254,Caller==$Caller"; ::puts "rwb1_debug,314120241101111254,Name==$Name,ProcessRecord==\n$ProcessRecord"; ::puts "rwb1_debug,314120241101111254,Name==$Name,ProcessRecords==\n$ProcessRecords"; ::qw::bug 314120241101111254 "[::qw::procname] - can't find field \"$Field\"."; } } } } ::if {$rwb1_debug} { ::set Milli [::expr {[::clock clicks -milliseconds]-$Milli}]; ::puts "rwb1_debug,314120241107104433,Milli==$Milli"; } } ::proc ::qw::process_file::load {sargs} { /* { Caller could instead load the file directly but this proc also checks that the file contents are a well-formed sargs. */ } ::set rwb1_debug 0; ::qw::profile::finally "::qw::process_file::load"; ::if {![::qw::mutex_manager mutex_is_locked .mutex_name $::qw::control(process_list_file)]} { ::qw::bug 314120241113155913 "[::qw::method_name] - accessing process_list_file without a mutex."; } ::if {$rwb1_debug} { ::sargs::file::check $sargs .path $::qw::control(process_list_file); } ::set ProcessRecords [::sargs::file::get $::qw::control(process_list_file)]; ::return $ProcessRecords; }