#!/usr/bin/env python3
#
# Count .atom feed subscriptions from an nginx-formatted
# access log file. Counts each IPv4/IPv6 address only once per feed.
# Parses subscription counts from User-Agents that provide
# this information. Filters out any low-traffic feeds
# with fewer than ten subscribers (false matches).
from io import open
from operator import itemgetter
from re import search
from sys import argv, exit
import os
# Feeds whose subscriber total is below this are dropped from the report
# as probable false matches.
MIN_SUBSCRIBERS = 10

# Request line of an access-log entry for a GET or HEAD of an .atom feed.
# Groups: (1) text before " - - " (the client address), (2) the feed path,
# (3) the last double-quoted field on the line (the User-Agent).
# NOTE: the method is a real alternation; a character class such as
# [GET|HEAD] would match any single one of those letters (e.g. "POST").
FEED_REQUEST_PATTERN = r'^(.*) - - .*(?:GET|HEAD) (/.*\.atom) .*"(.*)"$'

# Subscriber count advertised by a feed aggregator in its User-Agent,
# e.g. "25 subscribers", "12 readers", "3 subs".
SUBSCRIBER_COUNT_PATTERN = r'([0-9]+) (?:subs|readers)'


def parse_log_line(logline):
    """Extract the feed request described by one access-log line.

    Returns a ``(user, feed, subscribers)`` tuple, or ``None`` when the
    line is not a GET/HEAD request for an ``.atom`` path. ``subscribers``
    is the count advertised in the User-Agent, or 1 when the User-Agent
    does not advertise one.
    """
    logparse = search(FEED_REQUEST_PATTERN, logline)
    if logparse is None:
        return None
    user, feed, agent = logparse.groups()
    subscribercount = search(SUBSCRIBER_COUNT_PATTERN, agent)
    if subscribercount is None:
        return user, feed, 1
    return user, feed, int(subscribercount.group(1))


def count_subscribers(loglines, feed_data=None, unique=None):
    """Accumulate per-feed subscriber totals from an iterable of log lines.

    ``feed_data`` maps feed path -> running total and ``unique`` maps feed
    path -> set of client addresses already counted for that feed. Both are
    updated in place, so passing the same dicts for several log files keeps
    an address from being counted twice across files. Fresh dicts are
    created when they are omitted. Returns ``feed_data``.
    """
    if feed_data is None:
        feed_data = {}
    if unique is None:
        unique = {}
    for logline in loglines:
        parsed = parse_log_line(logline)
        if parsed is None:
            continue
        user, feed, newsubscribers = parsed
        feed_data.setdefault(feed, 0)
        seen = unique.setdefault(feed, set())
        # Only the first request from an address contributes to a feed.
        if user not in seen:
            seen.add(user)
            feed_data[feed] += newsubscribers
    return feed_data


def feed_counts(feed_data, minimum=MIN_SUBSCRIBERS):
    """Return ``(count, feed)`` pairs with ``count >= minimum``.

    The pairs are sorted by ascending count; feeds with equal counts keep
    the order in which they appear in ``feed_data``.
    """
    counts = [(count, feed) for feed, count in feed_data.items()
              if count >= minimum]
    return sorted(counts, key=itemgetter(0))


def main(args):
    """Run the report for ``args`` (program name followed by log paths).

    Exits with a usage message when no path is given, and with an error
    message at the first path that is not readable; every path is checked
    before any file is parsed.
    """
    if len(args) < 2:
        exit('Usage: {0} <nginx-log-files>'.format(args[0]))
    paths = args[1:]
    for path in paths:
        if not os.access(path, os.R_OK):
            exit('The log file at {0} is not readable.'.format(path))

    feed_data = {}
    unique = {}
    for path in paths:
        # errors='replace' keeps one undecodable byte from aborting the run.
        with open(path, 'r', encoding='utf-8', errors='replace') as logfile:
            count_subscribers(logfile, feed_data, unique)

    for count, feed in feed_counts(feed_data):
        print("{0} subscribers in {1}".format(count, feed))


if __name__ == '__main__':
    main(argv)