Hi,
On Thu, Mar 2, 2023 at 12:14 PM Alexander Aring <[email protected]> wrote:
>
> This patch adds a python tool to plot with use of python plotly [0] the
> dlm_controld cluster-wide lockdb. It scans dlm_controld plock log files
> for the required information about possible contention (waiters) and the
> acquired lock modes over time.
>
> Example:
>
> ./lockdb_plot /var/log/dlm_controld/plock.log
>
> and plotly will start a webserver and you can see a generated, interactive
> plot about the lockranges in y axis and time in x axis. Depending on
> usage it's limited and does not scale. Ther exists arguments like --info
> to get plock.log timing information and with --ts_start/ts_end you can
> filter a specific time range to only show a small part of the plock log
> to reduce the amount of rectangles need to be drawn by the browser.
>
> It shows only lock modes and contention, no lock requests. Means A trylock
> is only visible when it was successful. A non trylock will be drawn and
> has possible contention.
>
> With this plot it should be easy to figure out on which node a specific
> lock range was acquired and resulted in contention.
>
> [0] https://plotly.com/python/
> ---
> python/tools/lockdb_plot | 429 +++++++++++++++++++++++++++++++++++++++
> 1 file changed, 429 insertions(+)
> create mode 100755 python/tools/lockdb_plot
>
> diff --git a/python/tools/lockdb_plot b/python/tools/lockdb_plot
> new file mode 100755
> index 00000000..761bdea2
> --- /dev/null
> +++ b/python/tools/lockdb_plot
> @@ -0,0 +1,429 @@
> +#!/usr/bin/env python3
> +
> +import plotly.graph_objects as go
> +from enum import Enum
> +import argparse
> +import sys
> +
> +NSECS_PER_SECS = 1000000000
> +
> +def nsecs_to_secs(nsecs):
> + return nsecs/NSECS_PER_SECS
> +
> +def secs_to_nsecs(secs):
> + return secs * NSECS_PER_SECS
> +
> +# entry point of collecting data
> +nodes = dict()
> +# will parse log to get our_nodeid
> +our_nodeid = None
> +# legend groups, tmp set to show legend entry only once
> +legend_groups = set()
> +
> +class EType(Enum):
> + LOCK = 1
> + WAITER = 2
> +
> +def lookup_lockdb(nodeid, ls, number):
> + if nodeid not in nodes:
> + nodes[nodeid] = dict()
> +
> + lockspaces = nodes[nodeid]
> + if ls not in lockspaces:
> + lockspaces[ls] = dict()
> +
> + if number not in lockspaces[ls]:
> + lockspaces[ls][number] = LockDb(nodeid, ls, number)
> +
> + return lockspaces[ls][number]
> +
> +def plotly_rect_shape(x0, y0, x1, y1, color, opacity, text, group):
> + # we show it in secs
> + x0 = nsecs_to_secs(x0)
> + x1 = nsecs_to_secs(x1)
> +
> + rv = go.Scatter(x=[x0,x0,x1,x1,x0],
> + y=[y0,y1,y1,y0,y0],
> + fill="toself",
> + fillcolor=color,
> + mode='lines',
> + name=group,
> + legendgroup=group,
> + showlegend=group not in legend_groups,
> + visible=True,
> + text=text,
> + line=dict(color=color,
> + width=1),
> + opacity=opacity)
> +
> + legend_groups.add(group)
> + return rv
> +
> +
> +class LockDbRect():
> + def __init__(self, **kwargs):
> + self.ls = kwargs["ls"]
> + self.number = kwargs["number"]
> + self.pid = kwargs["pid"]
> + self.nodeid = kwargs["nodeid"]
> + self.owner = kwargs["owner"]
> + self.ts_start = kwargs["ts_start"]
> + self.ts_end = kwargs["ts_end"]
> + # +1 because lock range 1000-1000 will then be 999-1000
> + # exclusive vs inclusive we need to decide how to display
> + # it but we don't set a zero height
> + self.lk_start = kwargs["lk_start"] - 1
> + self.lk_end = kwargs["lk_end"]
> +
> + def create_shape(self):
> + return plotly_rect_shape(self.ts_start,
> + self.lk_start,
> + self.ts_end,
> + self.lk_end,
> + self.get_color(),
> + self.get_opacity(),
> + self.get_text(),
> + self.get_group())
> +
> + def get_duration(self):
> + return self.ts_end - self.ts_start
> +
> + def get_range(self):
> + return self.lk_end - self.lk_start
> +
> + def __str__(self):
> + return str(self.__dict__)
> +
> + def __repr__(self):
> + return str(self)
> +
> +class LockDbRectWaiter(LockDbRect):
> + def __init__(self, **kwargs):
> + super(LockDbRectWaiter, self).__init__(**kwargs)
> +
> + def get_color(self):
> + return "green"
> +
> + def get_opacity(self):
> + return 0.2
> +
> + def get_text(self):
> + # TODO make it human readable for high numbers?
> + ts_start = nsecs_to_secs(self.ts_start);
> + ts_end = nsecs_to_secs(self.ts_end);
> + dt = nsecs_to_secs(self.get_duration());
> +
> + text = "nodeid: {}, pid: {}, owner: {}<br>".format(self.nodeid,
> self.pid, hex(self.owner)) + \
> + "ts_start: {0:.3f},".format(ts_start) + \
> + "ts_end: {0:.3f},".format(ts_end) + \
> + "dt: {0:.3f}<br>".format(dt) + \
> + "lk_start: {}, lk_end: {}, dy:
> {}<br>".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range()))
> +
> + return text
> +
> + def get_text_pl(self):
> + text = "contention:<br>" + \
> + "start: {0:.3f},".format(nsecs_to_secs(self.ts_start)) + \
> + "end: {0:.3f},".format(nsecs_to_secs(self.ts_end)) + \
> + "dt: {0:.3f}".format(nsecs_to_secs(self.get_duration()))
> +
there are some missing spaces after each comma. I will try to add a
function and use this function for the same code below.
> + return text
> +
> + def get_group(self):
> + return "contention"
> +
> +class LockDbRectPLock(LockDbRect):
> + def __init__(self, **kwargs):
> + super(LockDbRectPLock, self).__init__(**kwargs)
> + self.mode = kwargs["mode"]
> + if self.mode == "WR":
> + self.color = "red"
> + self.group = "write lock"
> + elif self.mode == "RD":
> + self.color = "blue"
> + self.group = "read lock"
> + else:
> + print("{}: received unknown mode".format(self.ts_start))
> + return
> +
> + self.waiter = None
> +
> + def __str__(self):
> + return str(self.__dict__)
> +
> + def __repr__(self):
> + return str(self)
> +
> + def has_waiter(self):
> + return self.waiter
> +
> + def set_waiter(self, wa):
> + if self.has_waiter():
> + print("{}: has already a waiter set".format(self.ts_start))
> + return
> +
> + self.waiter = wa
> +
> + def get_waiter(self):
> + return self.waiter
> +
> + def get_color(self):
> + return self.color
> +
> + def get_opacity(self):
> + return 0.4
> +
> + def get_text(self):
> + # TODO make it human readable for high numbers?
> + ts_start = nsecs_to_secs(self.ts_start);
> + ts_end = nsecs_to_secs(self.ts_end);
> + dt = nsecs_to_secs(self.get_duration());
> +
> + text = "nodeid: {}, pid: {}, owner: {}<br>".format(self.nodeid,
> self.pid, hex(self.owner)) + \
> + "ts_start: {0:.3f},".format(ts_start) + \
> + "ts_end: {0:.3f},".format(ts_end) + \
> + "dt: {0:.3f}<br>".format(dt) + \
here...
> + "lk_start: {}, lk_end: {}, dy:
> {}<br>".format(hex(self.lk_start), hex(self.lk_end), hex(self.get_range()))
> +
> + if self.has_waiter():
> + text = text + "<br>{}".format(self.get_waiter().get_text_pl())
> +
> + return text;
> +
> + def get_group(self):
> + return self.group
> +
> +class LockDb:
> + def __init__(self, nodeid, ls, number):
> + self.nodeid = nodeid
> + self.ls = ls
> + self.number = number
> + self.__ts = dict()
> + self.plocks = list()
> + self.__last_plock = dict()
> + self.waiters = list()
> + self.__last_waiter = dict()
> + self.__ts_start_min = None
> + self.__ts_end_max = 0
> +
> + def event_start(self, vals):
> + self.__ts[vals["ptr"]] = vals
> +
> + def __plock(self, vals_start, vals_end):
> + pl = LockDbRectPLock(ls=vals_start["ls"],
> + number=vals_start["number"],
> + pid=vals_start["pid"],
> + nodeid=vals_start["nodeid"],
> + owner=vals_start["owner"],
> + ts_start=vals_start["ts"],
> + ts_end=vals_end["ts"],
> + lk_start=vals_start["start"],
> + lk_end=vals_start["end"],
> + mode=vals_start["mode"])
> + self.plocks.append(pl)
> + self.__last_plock[vals_end["ptr"]] = pl
> +
> + try:
> + pl.set_waiter(self.__last_waiter[vals_start["owner"]])
> + del(self.__last_waiter[vals_start["owner"]])
> + except KeyError:
> + pass
> +
> + def __waiter(self, vals_start, vals_end):
> + wa = LockDbRectWaiter(ls=vals_start["ls"],
> + number=vals_start["number"],
> + pid=vals_start["pid"],
> + nodeid=vals_start["nodeid"],
> + owner=vals_start["owner"],
> + ts_start=vals_start["ts"],
> + ts_end=vals_end["ts"],
> + lk_start=vals_start["start"],
> + lk_end=vals_start["end"])
> + self.waiters.append(wa)
> + self.__last_waiter[vals_start["owner"]] = wa
> +
> + def event_end(self, vals):
> + try:
> + vals_start = self.__ts[vals["ptr"]]
> + del(self.__ts[vals["ptr"]])
> + except KeyError:
> + print("{}: cannot find previous start,
> ignoring".format(vals["ts"]))
> + return
> +
> + if not self.__ts_start_min:
> + self.__ts_start_min = vals_start["ts"]
> + else:
> + self.__ts_start_min = min(self.__ts_start_min, vals_start["ts"])
> +
> + if not self.__ts_start_min:
> + self.__ts_end_max = vals["ts"]
> + else:
> + self.__ts_end_max = max(self.__ts_end_max, vals["ts"])
> +
> + if vals["event"] == EType.LOCK:
> + self.__plock(vals_start, vals)
> + elif vals["event"] == EType.WAITER:
> + self.__waiter(vals_start, vals)
> + else:
> + print("{}: unknown event type".format(vals["ts"]))
> +
> + def add_shapes(self, fig):
> + global legend_groups
> + legend_groups = set()
> +
> + # first waiters to be in background
> + for wa in self.waiters:
> + fig.add_trace(wa.create_shape())
> +
> + # next layer to draw lock states
> + for pl in self.plocks:
> + fig.add_trace(pl.create_shape())
> +
> + def get_ts_min(self):
> + return self.__ts_start_min
> +
> + def get_ts_max(self):
> + return self.__ts_end_max
> +
> + def __str__(self):
> + return "ls {} inode {}".format(self.ls, self.number)
> +
> + def __repr__(self):
> + return str(self)
> +
> +parser = argparse.ArgumentParser(description="Plots dlm_controld lock
> databases")
> +parser.add_argument('-i', "--info", action="store_true")
> +parser.add_argument('-s', "--ts_start", type=float)
> +parser.add_argument('-e', "--ts_end", type=float)
> +
> +parser.add_argument('plock_logs', type=argparse.FileType('r'), nargs='+')
> +args = parser.parse_args()
> +
> +for f in args.plock_logs:
> + lines = f.readlines()
> +
> + # Current way to figure out our nodeid
> + for line in lines:
> + if "read plock" in line:
> + lsplit = line.split()
> + npo = lsplit[11].split('/')
> + nodeid = int(npo[0])
> + our_nodeid = nodeid
> + break
> +
> + if not our_nodeid:
> + print("{}: could not determine our nodeid".format(f))
> + continue
> +
> + for line in lines:
> + if "state plock" in line:
> + pass
> + elif "state waiter" in line:
> + pass
> + else:
> + continue
> +
> + vals = dict()
> + lsplit = line.split()
> + vals["ts"] = int(lsplit[3])
> +
> + if args.ts_start and secs_to_nsecs(args.ts_start) > int(vals["ts"]):
> + continue
> +
> + if args.ts_end and secs_to_nsecs(args.ts_end) < int(vals["ts"]):
> + break;
> +
> + vals["ls"] = lsplit[4]
> + vals["op"] = lsplit[7]
> + vals["number"] = int(lsplit[8], 16)
> + vals["ptr"] = lsplit[9]
> +
> + db = lookup_lockdb(our_nodeid, vals["ls"], vals["number"])
> + if "state plock" in line:
> + vals["event"] = EType.LOCK
> + if vals["op"] == "start":
> + vals["mode"] = lsplit[10]
> + se = lsplit[11].split('-')
> + vals["start"] = int(se[0], 16)
> + vals["end"] = int(se[1], 16)
> + npo = lsplit[12].split('/')
> + vals["nodeid"] = int(npo[0])
> + vals["pid"] = int(npo[1])
> + vals["owner"] = int(npo[2], 16)
> + db.event_start(vals)
> + elif vals["op"] =="end":
> + db.event_end(vals)
> + else:
> + print("{}: unknown state plock operation:
> {}".format(vals["ts"], vals["op"]))
> + elif "state waiter" in line:
> + vals["event"] = EType.WAITER
> + if vals["op"] == "start":
> + se = lsplit[10].split('-')
> + vals["start"] = int(se[0], 16)
> + vals["end"] = int(se[1], 16)
> + npo = lsplit[11].split('/')
> + vals["nodeid"] = int(npo[0])
> + vals["pid"] = int(npo[1])
> + vals["owner"] = int(npo[2], 16)
> + db.event_start(vals)
> + elif vals["op"] == "end":
> + db.event_end(vals)
> + else:
> + print("{}: unknown state waiter operation:
> {}".format(vals["ts"], vals["op"]))
> + else:
> + print("received unknown line event: {}".format(line))
> +
> +if args.info:
> + ts_max = None
> + ts_min = None
> +
> + for nodeid in nodes:
> + for ls in nodes[nodeid]:
> + for number in nodes[nodeid][ls]:
> + db = lookup_lockdb(nodeid, ls, number)
> + if not ts_min:
> + ts_min = db.get_ts_min()
> + else:
> + ts_min = min(ts_min, db.get_ts_min())
> +
> + if not ts_max:
> + ts_max = db.get_ts_max()
> + else:
> + ts_max = min(ts_max, db.get_ts_max())
> +
> + print("global info:")
> + print("\tts_start: {0:.3f} secs".format(nsecs_to_secs(ts_min)))
> + print("\tts_end: {0:.3f} secs".format(nsecs_to_secs(ts_max)))
> + print("\tduration: {0:.3f} secs".format(nsecs_to_secs(ts_max) -
> nsecs_to_secs(ts_min)))
> +
> + print()
> + print("per inode lock info:")
> + for nodeid in nodes:
> + print("nodeid: {}".format(nodeid))
> + for ls in nodes[nodeid]:
> + print("\tls: {}".format(ls))
> + for number in nodes[nodeid][ls]:
> + print("\t\tinode: {}".format(hex(number)))
> + db = lookup_lockdb(nodeid, ls, number)
> + print("\t\t\tts_start: {0:.3f}
> secs".format(nsecs_to_secs(db.get_ts_min())))
> + print("\t\t\tts_end: {0:.3f}
> secs".format(nsecs_to_secs(db.get_ts_max())))
> + print("\t\t\tduration: {0:.3f}
> secs".format(nsecs_to_secs(db.get_ts_max()) - nsecs_to_secs(db.get_ts_min())))
> +
I will add a "top 5" high score of the longest duration of contention
state... So if somebody is looking for long contention states it is
easy to find them with a fast parsing of the log files.
- Alex