diff --git a/support-d/gdb/README.md b/support-d/gdb/README.md
new file mode 100644
index 0000000000..d3d198c5b5
--- /dev/null
+++ b/support-d/gdb/README.md
@@ -0,0 +1,76 @@
+`gdb` scripts
+-----------
+
+Originally from: https://github.com/facebook/folly/tree/593b6e76881042031b7f21d898c8e0874ea79fe0/folly/experimental/gdb
+
+This directory contains a collection of `gdb` scripts that we have found helpful.
+These scripts use the [gdb extension Python API](https://sourceware.org/gdb/current/onlinedocs/gdb/Python.html#Python).
+
+### How to run the scripts
+
+To run the scripts, fire up `gdb` and load a script with `source -v`. Example:
+
+```lang=bash
+$ gdb -p 123456
+(gdb) source -v ./support-d/gdb/deadlock.py
+Type "deadlock" to detect deadlocks.
+# At this point, any new commands defined in `deadlock.py` are available.
+(gdb) deadlock
+Found deadlock!
+...
+```
+
+### What does each script do?
+
+#### `deadlock.py` - Detect deadlocks
+
+Consider the following program that always deadlocks:
+
+```lang=cpp
+void deadlock3() {
+  std::mutex m1, m2, m3;
+  folly::Baton<> b1, b2, b3;
+
+  auto t1 = std::thread([&m1, &m2, &b1, &b2] {
+    std::lock_guard g1(m1);
+    b1.post();
+    b2.wait();
+    std::lock_guard g2(m2);
+  });
+
+  auto t2 = std::thread([&m3, &m2, &b3, &b2] {
+    std::lock_guard g2(m2);
+    b2.post();
+    b3.wait();
+    std::lock_guard g3(m3);
+  });
+
+  auto t3 = std::thread([&m3, &m1, &b3, &b1] {
+    std::lock_guard g3(m3);
+    b3.post();
+    b1.wait();
+    std::lock_guard g1(m1);
+  });
+
+  t1.join();
+  t2.join();
+  t3.join();
+}
+```
+
+The `deadlock.py` script introduces a new `deadlock` command that can help
+us identify the threads and mutexes involved in the deadlock.
+
+```lang=bash
+$ gdb -p 2174496
+(gdb) source -v ./support-d/gdb/deadlock.py
+Type "deadlock" to detect deadlocks.
+(gdb) deadlock
+Found deadlock!
+Thread 2 (LWP 2174497) is waiting on mutex (0x00007ffcff42a4c0) held by Thread 3 (LWP 2174498)
+Thread 3 (LWP 2174498) is waiting on mutex (0x00007ffcff42a4f0) held by Thread 4 (LWP 2174499)
+Thread 4 (LWP 2174499) is waiting on mutex (0x00007ffcff42a490) held by Thread 2 (LWP 2174497)
+```
+
+NOTE: This script only works on Linux and requires debug symbols to be installed
+for the `pthread` library.
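+
+If you use the script often, you can have `gdb` load it automatically at
+startup from `~/.gdbinit`. This is a minimal sketch; the path below is a
+placeholder for wherever this repository is checked out:
+
+```lang=bash
+# Load deadlock.py in every gdb session (adjust the path to your checkout).
+echo "source -v /path/to/support-d/gdb/deadlock.py" >> ~/.gdbinit
+```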
diff --git a/support-d/gdb/deadlock.py b/support-d/gdb/deadlock.py
new file mode 100644
index 0000000000..feeae0713f
--- /dev/null
+++ b/support-d/gdb/deadlock.py
@@ -0,0 +1,474 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from collections import defaultdict
+from enum import Enum
+
+import gdb
+
+
+class DiGraph:
+    """
+    Adapted from networkx: http://networkx.github.io/
+    Represents a directed graph. Edges can store (key, value) attributes.
+    """
+
+    def __init__(self):
+        # Map of node -> set of nodes
+        self.adjacency_map = {}
+        # Map of (node1, node2) -> map of string -> arbitrary attribute
+        # This will not be copied in subgraph()
+        self.attributes_map = {}
+
+    def neighbors(self, node):
+        return self.adjacency_map.get(node, set())
+
+    def edges(self):
+        edges = []
+        for node, neighbors in self.adjacency_map.items():
+            for neighbor in neighbors:
+                edges.append((node, neighbor))
+        return edges
+
+    def nodes(self):
+        return self.adjacency_map.keys()
+
+    def attributes(self, node1, node2):
+        return self.attributes_map[(node1, node2)]
+
+    def add_edge(self, node1, node2, **kwargs):
+        if node1 not in self.adjacency_map:
+            self.adjacency_map[node1] = set()
+        if node2 not in self.adjacency_map:
+            self.adjacency_map[node2] = set()
+        self.adjacency_map[node1].add(node2)
+        self.attributes_map[(node1, node2)] = kwargs
+
+    def remove_node(self, node):
+        self.adjacency_map.pop(node, None)
+        for _, neighbors in self.adjacency_map.items():
+            neighbors.discard(node)
+
+    def subgraph(self, nodes):
+        graph = DiGraph()
+        for node in nodes:
+            for neighbor in self.neighbors(node):
+                if neighbor in nodes:
+                    graph.add_edge(node, neighbor)
+        return graph
+
+    def node_link_data(self):
+        """
+        Returns the graph as a dictionary in a format that can be
+        serialized.
+        """
+        data = {
+            "directed": True,
+            "multigraph": False,
+            "graph": {},
+            "links": [],
+            "nodes": [],
+        }
+
+        # Do one pass to build a map of node -> position in nodes
+        node_to_number = {}
+        for node in self.adjacency_map.keys():
+            node_to_number[node] = len(data["nodes"])
+            data["nodes"].append({"id": node})
+
+        # Do another pass to build the link information
+        for node, neighbors in self.adjacency_map.items():
+            for neighbor in neighbors:
+                link = self.attributes_map[(node, neighbor)].copy()
+                link["source"] = node_to_number[node]
+                link["target"] = node_to_number[neighbor]
+                data["links"].append(link)
+        return data
+
+
+def strongly_connected_components(G):  # noqa: C901
+    """
+    Adapted from networkx: http://networkx.github.io/
+    Parameters
+    ----------
+    G : DiGraph
+    Returns
+    -------
+    comp : generator of sets
+        A generator of sets of nodes, one for each strongly connected
+        component of G.
+    """
+    preorder = {}
+    lowlink = {}
+    scc_found = {}
+    scc_queue = []
+    i = 0  # Preorder counter
+    for source in G.nodes():
+        if source not in scc_found:
+            queue = [source]
+            while queue:
+                v = queue[-1]
+                if v not in preorder:
+                    i = i + 1
+                    preorder[v] = i
+                done = 1
+                v_nbrs = G.neighbors(v)
+                for w in v_nbrs:
+                    if w not in preorder:
+                        queue.append(w)
+                        done = 0
+                        break
+                if done == 1:
+                    lowlink[v] = preorder[v]
+                    for w in v_nbrs:
+                        if w not in scc_found:
+                            if preorder[w] > preorder[v]:
+                                lowlink[v] = min([lowlink[v], lowlink[w]])
+                            else:
+                                lowlink[v] = min([lowlink[v], preorder[w]])
+                    queue.pop()
+                    if lowlink[v] == preorder[v]:
+                        scc_found[v] = True
+                        scc = {v}
+                        while scc_queue and preorder[scc_queue[-1]] > preorder[v]:
+                            k = scc_queue.pop()
+                            scc_found[k] = True
+                            scc.add(k)
+                        yield scc
+                    else:
+                        scc_queue.append(v)
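+
+
+# Illustrative sketch of the graph helpers above, kept as comments so nothing
+# extra runs when gdb sources this file. The example values are assumptions
+# for illustration, not part of the script's behavior:
+#
+#   g = DiGraph()
+#   g.add_edge("a", "b")
+#   g.add_edge("b", "a")
+#   g.add_edge("b", "c")
+#   list(strongly_connected_components(g))  # -> [{"c"}, {"a", "b"}], order may vary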
+
+
+def simple_cycles(G):  # noqa: C901
+    """
+    Adapted from networkx: http://networkx.github.io/
+    Parameters
+    ----------
+    G : DiGraph
+    Returns
+    -------
+    cycle_generator: generator
+        A generator that produces elementary cycles of the graph.
+        Each cycle is represented by a list of nodes along the cycle.
+    """
+
+    def _unblock(thisnode, blocked, B):
+        stack = {thisnode}
+        while stack:
+            node = stack.pop()
+            if node in blocked:
+                blocked.remove(node)
+                stack.update(B[node])
+                B[node].clear()
+
+    # Johnson's algorithm requires some ordering of the nodes.
+    # We assign the arbitrary ordering given by the strongly connected
+    # components. There is no need to track the ordering as each node is
+    # removed as it is processed.
+    # Save the actual graph so we can mutate it here.
+    # We only take the edges because we do not want to
+    # copy edge and node attributes here.
+    subG = G.subgraph(G.nodes())
+    sccs = list(strongly_connected_components(subG))
+    while sccs:
+        scc = sccs.pop()
+        # order of scc determines ordering of nodes
+        startnode = scc.pop()
+        # Processing node runs 'circuit' routine from recursive version
+        path = [startnode]
+        blocked = set()  # vertex: blocked from search?
+        closed = set()  # nodes involved in a cycle
+        blocked.add(startnode)
+        B = defaultdict(set)  # graph portions that yield no elementary circuit
+        stack = [(startnode, list(subG.neighbors(startnode)))]
+        while stack:
+            thisnode, nbrs = stack[-1]
+            if nbrs:
+                nextnode = nbrs.pop()
+                if nextnode == startnode:
+                    yield path[:]
+                    closed.update(path)
+                elif nextnode not in blocked:
+                    path.append(nextnode)
+                    stack.append((nextnode, list(subG.neighbors(nextnode))))
+                    closed.discard(nextnode)
+                    blocked.add(nextnode)
+                    continue
+            # done with nextnode... look for more neighbors
+            if not nbrs:  # no more nbrs
+                if thisnode in closed:
+                    _unblock(thisnode, blocked, B)
+                else:
+                    for nbr in subG.neighbors(thisnode):
+                        if thisnode not in B[nbr]:
+                            B[nbr].add(thisnode)
+                stack.pop()
+                path.pop()
+        # done processing this node
+        subG.remove_node(startnode)
+        H = subG.subgraph(scc)  # make smaller to avoid work in SCC routine
+        sccs.extend(list(strongly_connected_components(H)))
+
+
+def find_cycle(graph):
+    """
+    Looks for a cycle in the graph. If found, returns the first cycle.
+    If nodes a1, a2, ..., an are in a cycle, then this returns:
+        [(a1, a2), (a2, a3), ..., (an-1, an), (an, a1)]
+    Otherwise returns an empty list.
+    """
+    cycles = list(simple_cycles(graph))
+    if cycles:
+        nodes = cycles[0]
+        nodes.append(nodes[0])
+        edges = []
+        prev = nodes[0]
+        for node in nodes[1:]:
+            edges.append((prev, node))
+            prev = node
+        return edges
+    else:
+        return []
+
+
+def get_stacktrace(thread_id):
+    """
+    Returns the stack trace for the thread id as a list of strings.
+    """
+    gdb.execute("thread %d" % thread_id, from_tty=False, to_string=True)
+    output = gdb.execute("bt", from_tty=False, to_string=True)
+    stacktrace_lines = output.strip().split("\n")
+    return stacktrace_lines
+
+
+def is_thread_blocked_with_frame(
+    thread_id, top_line, expected_top_lines, expected_frame
+):
+    """
+    Returns True if any of expected_top_lines is found in top_line, and
+    expected_frame is found in the thread's stack trace.
+    """
+    if all(expected not in top_line for expected in expected_top_lines):
+        return False
+    stacktrace_lines = get_stacktrace(thread_id)
+    return any(expected_frame in line for line in stacktrace_lines)
+
+
+class MutexType(Enum):
+    """Types of mutexes for which we can detect deadlocks."""
+
+    PTHREAD_MUTEX_T = "pthread_mutex_t"
+    PTHREAD_RWLOCK_T = "pthread_rwlock_t"
+
+    @staticmethod
+    def get_mutex_type(thread_id, top_line):
+        """
+        Returns the probable mutex type, based on the first line
+        of the thread's stack. Returns None if not found.
+        """
+
+        WAITLIST = [
+            "__lll_lock_wait",
+            "futex_abstimed_wait",
+            "futex_abstimed_wait_cancelable",
+            "futex_reltimed_wait",
+            "futex_reltimed_wait_cancelable",
+            "futex_wait",
+            "futex_wait_cancelable",
+        ]
+
+        if is_thread_blocked_with_frame(thread_id, top_line, WAITLIST, "pthread_mutex"):
+            return MutexType.PTHREAD_MUTEX_T
+        if is_thread_blocked_with_frame(
+            thread_id, top_line, WAITLIST, "pthread_rwlock"
+        ):
+            return MutexType.PTHREAD_RWLOCK_T
+        return None
+
+    @staticmethod
+    def get_mutex_owner_and_address_func_for_type(mutex_type):
+        """
+        Returns a function to resolve the mutex owner and address for
+        the given type. The returned function f has the following
+        signature:
+
+            f: args: (map of thread lwp -> thread id), blocked thread lwp
+               returns: (lwp of thread owning mutex, mutex address)
+                        or (None, None) if not found.
+
+        Returns None if there is no function for this mutex_type.
+        """
+        if mutex_type == MutexType.PTHREAD_MUTEX_T:
+            return get_pthread_mutex_t_owner_and_address
+        if mutex_type == MutexType.PTHREAD_RWLOCK_T:
+            return get_pthread_rwlock_t_owner_and_address
+        return None
+
+
+def print_cycle(graph, lwp_to_thread_id, cycle):
+    """Prints the threads and mutexes involved in the deadlock."""
+    for m, n in cycle:
+        print(
+            "Thread %d (LWP %d) is waiting on %s (0x%016x) held by "
+            "Thread %d (LWP %d)"
+            % (
+                lwp_to_thread_id[m],
+                m,
+                graph.attributes(m, n)["mutex_type"].value,
+                graph.attributes(m, n)["mutex"],
+                lwp_to_thread_id[n],
+                n,
+            )
+        )
+
+
+def get_thread_info():
+    """
+    Returns a pair of:
+    - map of LWP -> thread ID
+    - map of blocked threads LWP -> potential mutex type
+    """
+    # LWP -> thread ID
+    lwp_to_thread_id = {}
+
+    # LWP -> potential mutex type it is blocked on
+    blocked_threads = {}
+
+    output = gdb.execute("info threads", from_tty=False, to_string=True)
+    lines = output.strip().split("\n")[1:]
+    regex = re.compile(r"[\s\*]*(\d+).*Thread.*\(LWP (\d+)\).*")
+    for line in lines:
+        try:
+            match = regex.match(line)
+            thread_id = int(match.group(1))
+            thread_lwp = int(match.group(2))
+            lwp_to_thread_id[thread_lwp] = thread_id
+            mutex_type = MutexType.get_mutex_type(thread_id, line)
+            if mutex_type:
+                blocked_threads[thread_lwp] = mutex_type
+        except Exception:
+            continue
+
+    return (lwp_to_thread_id, blocked_threads)
+
+
+def get_pthread_mutex_t_owner_and_address(lwp_to_thread_id, thread_lwp):
+    """
+    Finds the thread holding the mutex that this thread is blocked on.
+    Returns a pair of (lwp of thread owning mutex, mutex address),
+    or (None, None) if not found.
+    """
+    # Go up the stack to the pthread_mutex_lock frame
+    gdb.execute(
+        "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
+    )
+    gdb.execute("frame 1", from_tty=False, to_string=True)
+
+    # Get the owner of the mutex by inspecting the internal
+    # fields of the mutex.
+    try:
+        mutex_info = gdb.parse_and_eval("mutex").dereference()
+        mutex_owner_lwp = int(mutex_info["__data"]["__owner"])
+        return (mutex_owner_lwp, int(mutex_info.address))
+    except gdb.error:
+        return (None, None)
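+
+
+# For reference, the manual equivalent of the lookup above from a gdb prompt.
+# `__data.__owner` is a glibc implementation detail, so the exact field names
+# may differ between glibc versions; treat this as an illustrative sketch:
+#
+#   (gdb) frame 1                       # the pthread_mutex_lock frame
+#   (gdb) print mutex.__data.__owner    # LWP of the owning thread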
+
+
+def get_pthread_rwlock_t_owner_and_address(lwp_to_thread_id, thread_lwp):
+    """
+    If the thread is waiting on a write-locked pthread_rwlock_t, this will
+    return the pair of:
+        (lwp of thread that is write-owning the mutex, mutex address)
+    or (None, None) if not found, or if the mutex is read-locked.
+    """
+    # Go up the stack to the pthread_rwlock_{rd|wr}lock frame
+    gdb.execute(
+        "thread %d" % lwp_to_thread_id[thread_lwp], from_tty=False, to_string=True
+    )
+    gdb.execute("frame 2", from_tty=False, to_string=True)
+
+    # Get the owner of the mutex by inspecting the internal
+    # fields of the mutex.
+    try:
+        rwlock_info = gdb.parse_and_eval("rwlock").dereference()
+        rwlock_data = rwlock_info["__data"]
+        field_names = ["__cur_writer", "__writer"]
+        fields = rwlock_data.type.fields()
+        matching_fields = [f for f in fields if f.name in field_names]
+        if not matching_fields:
+            # Unexpected rwlock layout; we cannot determine the writer.
+            return (None, None)
+        rwlock_owner_lwp = int(rwlock_data[matching_fields[0]])
+        # We can only track the owner if it is currently write-locked.
+        # If it is not write-locked or if it is currently read-locked,
+        # possibly by multiple threads, we cannot find the owner.
+        if rwlock_owner_lwp != 0:
+            return (rwlock_owner_lwp, int(rwlock_info.address))
+        else:
+            return (None, None)
+    except gdb.error:
+        return (None, None)
+
+
+class Deadlock(gdb.Command):
+    """Detects deadlocks"""
+
+    def __init__(self):
+        super(Deadlock, self).__init__("deadlock", gdb.COMMAND_NONE)
+
+    def invoke(self, arg, from_tty):
+        """Prints the threads and mutexes in a deadlock, if it exists."""
+        lwp_to_thread_id, blocked_threads = get_thread_info()
+
+        # Nodes represent threads. Edge (A, B) exists if thread A
+        # is waiting on a mutex held by thread B.
+        graph = DiGraph()
+
+        # Go through all the blocked threads and see which threads
+        # they are blocked on, and build the thread wait graph.
+        for thread_lwp, mutex_type in blocked_threads.items():
+            get_owner_and_address_func = (
+                MutexType.get_mutex_owner_and_address_func_for_type(mutex_type)
+            )
+            if not get_owner_and_address_func:
+                continue
+            mutex_owner_lwp, mutex_address = get_owner_and_address_func(
+                lwp_to_thread_id, thread_lwp
+            )
+            if mutex_owner_lwp and mutex_address:
+                graph.add_edge(
+                    thread_lwp,
+                    mutex_owner_lwp,
+                    mutex=mutex_address,
+                    mutex_type=mutex_type,
+                )
+
+        # A deadlock exists if there is a cycle in the graph.
+        cycle = find_cycle(graph)
+        if cycle:
+            print("Found deadlock!")
+            print_cycle(graph, lwp_to_thread_id, cycle)
+        else:
+            print("No deadlock detected. Do you have debug symbols installed?")
+
+
+def load():
+    # Instantiate the Deadlock command
+    Deadlock()
+    print('Type "deadlock" to detect deadlocks.')
+
+
+def info():
+    return "Detect deadlocks"
+
+
+if __name__ == "__main__":
+    load()
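+
+
+# Worked example, matching deadlock3() in the README: t1 holds m1 and waits on
+# m2, t2 holds m2 and waits on m3, t3 holds m3 and waits on m1. The wait graph
+# built in invoke() is then t1 -> t2 -> t3 -> t1; find_cycle() returns that
+# cycle and print_cycle() reports one "waiting on ... held by ..." line per edge.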