Hello,
We have some news about the eosxd crashes. After enabling backtraces at level 2, we had one crash with a client running v4.5.15, which produced the following output:
191213 08:10:04 t=1576221004.001608 f=worker l=WARN tid=00007fa9d03fa700 s=EnvironmentReader:144 Reading /proc/39600/environ took 14ms (uid=33)
191213 08:18:22 t=1576221502.815290 f=worker l=WARN tid=00007fa9d23fe700 s=EnvironmentReader:144 Reading /proc/21213/environ took 15ms (uid=33)
Stack trace (most recent call last) in thread 26682:
#10 Object ", at 0xffffffffffffffff, in
#9 Object "/usr/lib64/libc.so.6, at 0x7fa9d74d5eac, in clone
#8 Source "pthread_create.c", line 0, in start_thread [0x7fa9d77acdd4]
#7 Object "/usr/lib64/libfuse.so.2, at 0x7fa9db769400, in fuse_session_loop
#6 Object "/usr/lib64/libfuse.so.2, at 0x7fa9db76cb6a, in fuse_reply_iov
BFD: Dwarf Error: Could not find abbrev number 90.
BFD: Dwarf Error: Could not find abbrev number 84.
BFD: Dwarf Error: Could not find abbrev number 5766.
BFD: Dwarf Error: Could not find abbrev number 5766.
#5 Source "/usr/src/debug/eos-4.5.15-1/fusex/eosfuse.cc", line 2455, in [0x5dfd8a]
2452: memset(&e, 0, sizeof(e));
2453: {
2454: metad::shared_md md;
>2455: md = Instance().mds.lookup(req, parent, name);
2456:
2457: if (md->id() && !md->deleted()) {
2458: cap::shared_cap pcap = Instance().caps.acquire(req, parent,
#4 Source "/usr/src/debug/eos-4.5.15-1/fusex/md/md.cc", line 231, in [0x63eb5d]
228: // try to get the meta data record
229: // --------------------------------------------------
230: pmd->Locker().UnLock();
> 231: md = get(req, inode, "", false, pmd, name);
232:
233: if (md) {
234: md->Locker().Lock();
#3 Source "/usr/src/debug/eos-4.5.15-1/fusex/md/md.cc", line 802, in [0x63da3e]
799: uint64_t l_ino;
800:
801: // store the retrieved meta data blob
> 802: if (!(l_ino = apply(req, *it, listing))) {
803: eos_static_crit("msg=\"failed to apply response\"");
804: } else {
805: ino = l_ino;
#2 Source "/opt/rh/devtoolset-6/root/usr/include/c++/6.3.1/bits/stl_map.h", line 981, in [0x635f31]
#1 Source "/opt/rh/devtoolset-6/root/usr/include/c++/6.3.1/bits/stl_tree.h", line 2274, in [0x5fe291]
#0 Object "/usr/lib64/libstdc++.so.6, at 0x7fa9d7f4e5a4, in std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)
Segmentation fault (Address not mapped to object [(nil)])
# umounthandler: executing fusermount -u -z /eos/jeodpp# umounthandler: sighandler received signal 11 - emitting signal 11 again
###### cleaning stale cache directory '/var/cache/eos/fusex/md-cache/jeodpp/fb91bd84-11f6-11ea-a19f-e04f4306cd94'
###### cleaning stale cache directory '/var/cache/eos/fusex/md-cache/jeodpp/7d6a3950-166e-11ea-914d-e04f4306cd94'
###### cleaning stale cache directory '/var/cache/eos/fusex/md-cache/jeodpp/309ca5ae-1d85-11ea-b155-e04f4306cd94'
###### cleaning stale cache directory '/var/cache/eos/fusex/md-cache/jeodpp/7aaf34fe-1d85-11ea-9ef6-e04f4306cd94'
191213 09:52:32 t=1576227152.989587 f=mdcommunicate l=WARN tid=00007fc98e3fb700 s=md:2691 MGM asked us to set our heartbeat interval to 10 seconds, enable dentry-messaging, enable writesizeflush, accepts appname, accepts mdquery and server-version=4.5.15::1
191213 09:52:33 t=1576227153.082138 f=run l=WARN tid=00007fc9a1df4e00 s=eosfuse:1541 ********************************************************************************
191213 09:52:33 t=1576227153.082193 f=run l=WARN tid=00007fc9a1df4e00 s=eosfuse:1543 eosxd started version 4.5.15 - FUSE protocol version 28
I hope this helps in understanding the crash.