Hello friends,
We have seen some rare crashes of the eosxd process in our CTA deployment. We are running eos-fusex 5.1.11 against eos-server 5.1.11 (XRootD 5.5.7).
Error:
Segmentation fault eosxd -ofsname=${name} -f
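In case it helps, this is how we plan to grab a stack trace on the next crash. A minimal sketch only; the debuginfo package names and core path are assumptions on our EL nodes, not something from the crash itself:

# allow core dumps and write them somewhere predictable (needs root)
ulimit -c unlimited
echo '/var/tmp/core.%e.%p' > /proc/sys/kernel/core_pattern

# install matching debug symbols so the trace is symbolized
# (package names assumed)
yum install -y eos-fusex-debuginfo xrootd-debuginfo

# after the next segfault, pull a full backtrace from the core
gdb /usr/bin/eosxd /var/tmp/core.eosxd.<pid> \
    -ex 'thread apply all bt' -ex 'quit'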
Config:
{
  "name" : "aarnet-cloudstor",
  "hostport" : "ourmgm.local:1094",
  "remotemountdir" : "/eos/aarnet-cloudstor",
  "localmountdir" : "/eos/aarnet-cloudstor",
  "mdcachedir" : "/srv/eos/fusex/aarnet-cloudstor",
  "mdzmqtarget" : "tcp://ourmgm.local:1100",
  "mdzmqidentity" : "crlt-e10",
  "options" : {
    "debug" : 0,
    "debuglevel" : 4,
    "libfusethreads" : 0,
    "md-kernelcache" : 1,
    "md-kernelcache.enoent.timeout" : 5,
    "md-backend.timeout" : 86400,
    "md-backend.put.timeout" : 120,
    "data-kernelcache" : 1,
    "mkdir-is-sync" : 1,
    "create-is-sync" : 1,
    "symlink-is-sync" : 1,
    "rename-is-sync" : 1,
    "rmdir-is-sync" : 0,
    "global-flush" : 0,
    "global-locking" : 1,
    "fd-limit" : 524288,
    "no-fsync" : [ ".db", ".db-journal", ".sqlite", ".sqlite-journal", ".db3", ".db3-journal", "*.o" ],
    "overlay-mode" : "000",
    "rm-rf-protect-levels" : 1,
    "rm-rf-bulk" : 1,
    "show-tree-size" : 1,
    "free-md-asap" : 1,
    "cpu-core-affinity" : 0,
    "no-xattr" : 1,
    "no-link" : 1,
    "nocache-graceperiod" : 5,
    "leasetime" : 300,
    "write-size-flush-interval" : 5
  },
  "auth" : {
    "krb5" : 0,
    "gsi-first" : 0,
    "sss" : 0,
    "ssskeytab" : "/etc/eos.cdn.keytab",
    "shared-mount" : 1,
    "environ-deadlock-timeout" : 100,
    "forknoexec-heuristic" : 1
  },
  "recovery" : {
    "read-open" : 1,
    "read-open-noserver" : 1,
    "read-open-noserver-retrywindow" : 15,
    "write-open" : 0,
    "write-open-noserver" : 0,
    "write-open-noserver-retrywindow" : 15
  },
  "cache" : {
    "type" : "disk",
    "size-mb" : 1000,
    "size-ino" : 65536,
    "journal-mb" : 16134,
    "journal-ino" : 65536,
    "clean-threshold" : 85.0,
    "location" : "/srv/eos/fusex/cache/aarnet-cloudstor/",
    "journal" : "/srv/eos/fusex/journal/aarnet-cloudstor/",
    "read-ahead-strategy" : "dynamic",
    "read-ahead-bytes-nominal" : 1048576,
    "read-ahead-bytes-max" : 8388608,
    "read-ahead-blocks-max" : 8388608,
    "max-read-ahead-buffer" : 1073741824,
    "max-write-buffer" : 1073741824
  }
}
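For context on how this file is consumed: with -ofsname=aarnet-cloudstor the daemon should pick it up from /etc/eos/fuse.aarnet-cloudstor.conf (the usual eosxd naming convention, as far as we understand it). A small sketch of how we deploy and sanity-check it; the json.tool check is just our own precaution:

# eosxd -ofsname=aarnet-cloudstor reads /etc/eos/fuse.aarnet-cloudstor.conf
install -m 600 fuse.aarnet-cloudstor.conf /etc/eos/fuse.aarnet-cloudstor.conf

# verify the file parses as JSON before (re)mounting
python3 -m json.tool /etc/eos/fuse.aarnet-cloudstor.conf > /dev/null \
    && echo "config OK"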
Fusex logs:
230512 09:39:54 t=1683884394.102888 f=WaitPrefetch l=ERROR ino:800000008b1d519b s=data:970 pre-read failed error=[FATAL] Unknown error code: software caused connection abort: request timeout
230512 09:39:54 t=1683884394.105715 f=recover_ropen l=WARN ino:800000008b1d5198 s=data:1174 recover read-open [1]
230512 09:39:54 t=1683884394.105727 f=recover_ropen l=WARN ino:800000008b1d5198 s=data:1207 recover reopening file for read
230512 09:39:54 t=1683884394.105851 f=recover_ropen l=WARN ino:800000008b1d5198 s=data:1222 applying exclusion list: tried=crlt-s56.cdn.aarnet.edu.au,
230512 09:39:54 t=1683884394.106236 f=HandleResponseWithHosts l=ERROR tid=00007f05ec3f7700 s=xrdclproxy:559 state=failed async open returned errmsg=[ERROR] Socket timeout
---- high rate error messages suppressed ----
fusermount: failed to unmount /eos/aarnet-cloudstor: Invalid argument
# umounthandler: executing fusermount -u -z /eos/aarnet-cloudstor
# umounthandler: sighandler received signal 11 - emitting signal 11 again
230512 09:39:54 t=1683884394.236312 f=lookupNonLocalJail l=ALERT tid=00007f058e7ff700 s=SecurityChecker:212 Failed to openat file
# umounthandler: executing fusermount -u -z /eos/aarnet-cloudstor
# umounthandler: sighandler received signal 11 - emitting signal 11 again
fusermount: failed to unmount /eos/aarnet-cloudstor: Invalid argument
# umounthandler: executing fusermount -u -z /eos/aarnet-cloudstor
# umounthandler: sighandler received signal 11 - emitting signal 11 again
fusermount: failed to unmount /eos/aarnet-cloudstor: Invalid argument
fusermount: failed to unmount /eos/aarnet-cloudstor: Invalid argument
fusermount: failed to unmount /eos/aarnet-cloudstor: Invalid argument
This has happened a couple of times now, and each time we see the same error just before the crash:
Unknown error code: software caused connection abort: request timeout
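For the next occurrence we are also planning to raise verbosity on this mount and keep the client log around. A sketch of what we intend to change; the log path follows what we believe is the default eosxd layout:

# bump verbosity in /etc/eos/fuse.aarnet-cloudstor.conf before remounting
# (our current values are "debug" : 0, "debuglevel" : 4)
#   "debug" : 1,

# then follow the per-mount client log around the time of the crash
tail -F /var/log/eos/fusex/fuse.aarnet-cloudstor.log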
Any ideas on how to debug this further?
Thank you,
Denis