On Mon, 26 Aug 2013 14:24:12 -0400, Andres Chavez <fluxboxtrem...@gmail.com> wrote: > Hi, can anyone tell me the best or at least the most used real time > bandwidth monitoring tool, when using the PF+ALTQ solution please? > > thanks in advance.
We use Graphite for the display of data received by statsd; we then run the following script using cron every 60 seconds to transmit the PF queues to statsd. (I'm not a great coder by any stretch of the imagination, so please feel free to clean/optimise/improve it — please share and let me know if you do ;)) NB: test_master is a check to exit the script if it is not the CARP master, as this script also runs on all our remote office firewalls which have IPSec VPNs to head office (where the graphite/statsd server is located). Currently if a CARP backup firewall tries to send data back, it will try and send the data itself (and not via the CARP master) even though the IPSec tunnel is only valid on the CARP master; this causes the firewalls back at head office to immediately distrust the IPSec VPN and so break it completely, due to receiving the IPSec payload from the backup. I have mentioned this before on here and think I need to build a gif tunnel or do something else (someone gave a good suggestion for this but I haven't had time to fix it). NB: phys_iface is generated by a 'puppet facter' in our case; this can of course be replaced with a manual string. NB: The drop_rates_file_base file is monitored by Nagios to provide us an Alarm if a particular queue is saturating and dropping packets heavily.
#!/usr/local/bin/python
# Script to extract queue stats from the ALTQ PF queues, transmit them to
# statsd for graphite graphing, and log drop ratios locally for Nagios alerting.
# Written and maintained by Andrew Lemin
#
# NB: Replace <%=@hostname-%> with the server's hostname, or let puppet do it.

import re
import socket
import subprocess
import sys
import logging
from time import sleep, time

logging.basicConfig(filename='/var/log/pf_queue_monitor.log',
                    level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(name)s %(message)s')
logger = logging.getLogger(__name__)


def test_master():
    """Exit silently unless this host is currently the CARP master.

    The script runs from cron on every firewall of a CARP pair; only the
    master holds the IPSec tunnel back to the statsd server, so a backup
    must not try to transmit (doing so breaks the VPN -- see list thread).
    """
    try:
        ifconfig_out = subprocess.check_output(
            ["/sbin/ifconfig"]).decode("utf-8", "replace")
    except Exception as err:
        logger.error(err)
        sys.exit()
    # Equivalent of the old: ifconfig | grep "status: master" | wc -l
    if "status: master" not in ifconfig_out:
        sys.exit()


## all_iface set by puppet from 'facter interfaces'
#all_iface = "<%=@interfaces%>".split(',')
all_iface = "lo0,em0,em1,em2,em3,em4,em5,enc0,pflog0,pflog1,pflog2,pflow0,pfsync0,carp0,carp1,carp2,carp3,carp4".split(',')

# Drop loopback/virtual interfaces; only physical NICs carry ALTQ queues.
regex = re.compile('lo|pf|carp|enc')
phys_iface = [x for x in all_iface if not regex.match(x)]

# Per-interface drop-ratio files, monitored by Nagios to raise an alarm when
# a particular queue is saturating and dropping packets heavily.
drop_rates_file_base = "/var/spool/pf_queue_drop_rate"
run_time = 55                        # seconds between the two counter samples
graphite = "<IP ADDRESS of STATSD>"  # statsd host (gauges over UDP, port 8125)


def netcat(hostname, port, content):
    """Fire-and-forget *content* (bytes) to hostname:port over UDP.

    Send errors are logged, never raised; a lost gauge sample is harmless.
    """
    s = socket.socket(family=socket.AF_INET, type=socket.SOCK_DGRAM)
    s.settimeout(3)
    try:
        s.sendto(content, (hostname, port))
    except socket.error as err:
        logger.error(err)
    finally:
        # BUGFIX: close even when an unexpected exception escapes sendto,
        # so the descriptor is never leaked.
        s.close()


def _parse_alloc_kb(alloc_str):
    """Convert pfctl's bandwidth token ('512Kb', '10Mb', '1Gb') to Kb (int).

    Unrecognised suffixes yield 0, matching the original behaviour.
    """
    suffix = alloc_str[-2:]
    if suffix == "Gb":
        return int(float(alloc_str[:-2])) * 1024 * 1024
    if suffix == "Mb":
        return int(float(alloc_str[:-2])) * 1024
    if suffix == "Kb":
        return int(float(alloc_str[:-2]))
    return 0


def get_queue_stats(iface):
    """Return (stats, kbtotalin, kbtotalout) for one physical interface.

    stats maps queue name -> [pkts, kb, drop_pkts, drop_kb, alloc_kb], all
    cumulative counters taken from 'pfctl -s queue -v'.  kbtotalin/out are
    the interface's cumulative kilobit totals from 'netstat -b'.
    Returns zeroed totals (and/or an empty stats dict) when the underlying
    commands fail, instead of crashing on an undefined variable as before.
    """
    stats = {}
    try:
        pfctl_out = subprocess.check_output(
            str("/sbin/pfctl -s queue -v").split()).decode("utf-8", "replace")
    except Exception as err:
        # BUGFIX: the original logged the error and then kept going, hitting
        # a NameError on the undefined pipeline output.
        logger.error(err)
        return (stats, 0, 0)

    # pfctl -v prints each 'queue ...' header followed by a '[ pkts: ... ]'
    # stats line.  Join each header with its stats line in Python (replacing
    # the old nine-process grep/sed/tr shell pipeline, whose '--' group
    # separators from grep -A1 could shift the field offsets) and keep only
    # the leaf queues named *_wan_* on this interface.
    lines = pfctl_out.splitlines()
    for idx, line in enumerate(lines):
        if not line.startswith("queue") or iface not in line:
            continue
        follower = lines[idx + 1] if idx + 1 < len(lines) else ""
        # Squeeze runs of spaces and keep one trailing separator so the
        # negative field offsets below match the original sed/tr output.
        entry = re.sub(" +", " ", line + " " + follower + " ")
        if "_wan_" not in entry or "root_" in entry or re.search("{.*}", entry):
            continue
        fields = entry.split(" ")
        queue_name = fields[1]
        pkts = int(fields[-10])
        kb = 8 * int(fields[-8]) / 1024
        drop_pkts = int(fields[-5])
        drop_kb = 8 * int(fields[-3]) / 1024
        alloc_kb = _parse_alloc_kb(fields[5])
        stats[queue_name] = [pkts, kb, drop_pkts, drop_kb, alloc_kb]

    # One netstat call provides both directions; the original ran the whole
    # five-stage pipeline twice per interface.
    try:
        netstat_out = subprocess.check_output(
            str("netstat -b -n -I %s" % (iface)).split()).decode("utf-8", "replace")
    except Exception as err:
        logger.error(err)
        return (stats, 0, 0)
    iface_lines = [l for l in netstat_out.splitlines() if iface in l]
    if not iface_lines:
        logger.error("no netstat output for %s" % iface)
        return (stats, 0, 0)
    # Last matching line, squeezed -- columns: Name Mtu Network Address
    # Ibytes Obytes.  Convert bytes to kilobits.
    totals = re.sub(" +", " ", iface_lines[-1]).split(" ")
    kbtotalin = 8 * int(totals[4]) / 1024
    kbtotalout = 8 * int(totals[5]) / 1024
    return (stats, kbtotalin, kbtotalout)


def get_all_stats():
    """Sample every physical interface; returns three dicts keyed by iface."""
    all_stats = {}
    all_kbtotalin = {}
    all_kbtotalout = {}
    for iface in phys_iface:
        (all_stats[iface],
         all_kbtotalin[iface],
         all_kbtotalout[iface]) = get_queue_stats(iface)
    return (all_stats, all_kbtotalin, all_kbtotalout)


def transmit_stats(base_stats, current_stats,
                   start_kbtotalin, current_kbtotalin,
                   start_kbtotalout, current_kbtotalout, interval=1):
    """Turn two cumulative samples into per-second gauge rates, send them to
    statsd, and write the per-interface queue drop ratios for Nagios.

    interval is the number of seconds between the two samples.
    """
    interval = interval or 1  # guard against a zero sampling interval
    for iface in list(current_stats.keys()):
        totin_rate = (current_kbtotalin[iface] - start_kbtotalin[iface]) / interval
        if totin_rate > 0:
            netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.ingress.kbps:%d|g" % (iface, totin_rate)), 'utf-8'))
        totout_rate = (current_kbtotalout[iface] - start_kbtotalout[iface]) / interval
        if totout_rate > 0:
            netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.egress.kbps:%d|g" % (iface, totout_rate)), 'utf-8'))

        prev_array = base_stats[iface]
        curr_array = current_stats[iface]
        drop_lines = []
        for queue_name in list(curr_array.keys()):
            if queue_name not in prev_array:
                continue  # queue appeared between samples; no rate to compute yet
            pkt_rate = (curr_array[queue_name][0] - prev_array[queue_name][0]) / interval
            kb_rate = (curr_array[queue_name][1] - prev_array[queue_name][1]) / interval
            pkt_drop_rate = (curr_array[queue_name][2] - prev_array[queue_name][2]) / interval
            kb_drop_rate = (curr_array[queue_name][3] - prev_array[queue_name][3]) / interval
            kb_alloc = curr_array[queue_name][4]
            if pkt_rate > 0:
                pkt_drop_ratio = (pkt_drop_rate / pkt_rate) * 100
            else:
                pkt_drop_ratio = 0
            # Negative rates mean the counters were reset mid-interval; skip.
            if pkt_rate >= 0:
                netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.%s.pps:%d|g" % (iface, queue_name, pkt_rate)), 'utf-8'))
            if kb_rate >= 0:
                netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.%s.kbps:%d|g" % (iface, queue_name, kb_rate)), 'utf-8'))
            if pkt_drop_rate >= 0:
                netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.%s.drop_pps:%d|g" % (iface, queue_name, pkt_drop_rate)), 'utf-8'))
            if kb_drop_rate >= 0:
                netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.%s.drop_kbps:%d|g" % (iface, queue_name, kb_drop_rate)), 'utf-8'))
            if pkt_drop_ratio >= 0:
                netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.%s.drop_ratio:%d|g" % (iface, queue_name, pkt_drop_ratio)), 'utf-8'))
            if kb_alloc >= 0:
                netcat(graphite, 8125, bytes(("traffic.<%=@hostname-%>.%s.%s.queue_size:%d|g" % (iface, queue_name, kb_alloc)), 'utf-8'))
            if pkt_drop_ratio >= 0:
                drop_lines.append("%s %d\n" % (queue_name, pkt_drop_ratio))
        # BUGFIX: the file was previously reopened in 'w' mode inside the
        # per-queue loop, so only the LAST queue's ratio ever survived (and
        # fh was undefined if open() raised).  Write all queues in one pass
        # and let the context manager close the handle.
        try:
            with open("%s_%s.data" % (drop_rates_file_base, iface), 'w') as fh:
                fh.writelines(drop_lines)
        except Exception as err:
            logger.error(err)
# TODO: Verify drop_rates_file_base file age in nagios nrpe script


def _main():
    """Sample, wait ~run_time seconds, resample, then transmit the rates."""
    test_master()  # bail out immediately on CARP backups
    start_time = int(time())
    start_stats, start_kbtotalin, start_kbtotalout = get_all_stats()
    elapsed = 0
    while elapsed < run_time:
        sleep(1)  # TODO: Analyze jitter values
        elapsed += 1
    end_time = int(time())
    current_stats, current_kbtotalin, current_kbtotalout = get_all_stats()
    # Use the measured wall-clock interval, not run_time, for the rates.
    transmit_stats(start_stats, current_stats,
                   start_kbtotalin, current_kbtotalin,
                   start_kbtotalout, current_kbtotalout,
                   (end_time - start_time))


if __name__ == "__main__":
    _main()

# Hope this does what you're looking for, Andrew Lemin