#!/usr/bin/env python3
#
# Count .atom feed subscriptions from nginx-formatted
# access log files. Counts each IPv4/IPv6 address only once
# per feed. Parses subscription counts from User-Agents that
# provide this information. Filters out any low-traffic feeds
# with fewer than ten subscribers (false matches).

from io import open
from operator import itemgetter
from re import search
from sys import argv, exit
import os

# One access-log line requesting an .atom feed. Groups: (1) everything
# before the " - - " separator (the client address), (2) the feed path,
# (3) the last double-quoted field on the line (the User-Agent).
# The method is a real alternation; a character class such as [GET|HEAD]
# would accept any request whose method merely ends in one of those
# letters (POST, PUT, DELETE, ...). The path is limited to non-whitespace
# so it cannot run on into later fields of the line.
FEED_REQUEST_PATTERN = r'^(.*) - - .*\b(?:GET|HEAD) (/\S*\.atom) .*"(.*)"$'

# Subscriber count announced inside a User-Agent, e.g. "12 subscribers"
# or "7 readers".
SUBSCRIBER_COUNT_PATTERN = r'([0-9]+) (?:subs|readers)'

# Feeds with fewer subscribers than this are not reported.
MIN_SUBSCRIBERS = 10


def parse_feed_request(logline):
    """Extract the feed request described by a single access-log line.

    Returns a ``(user, feed, newsubscribers)`` tuple, or ``None`` when the
    line is not a GET/HEAD request for an ``.atom`` path. ``newsubscribers``
    is the number announced in the User-Agent, or 1 when the User-Agent
    announces none.
    """
    logparse = search(FEED_REQUEST_PATTERN, logline)
    if logparse is None:
        return None

    user, feed, agent = logparse.groups()
    subscribercount = search(SUBSCRIBER_COUNT_PATTERN, agent)
    if subscribercount is not None:
        newsubscribers = int(subscribercount.group(1))
    else:
        newsubscribers = 1
    return user, feed, newsubscribers


def count_subscribers(loglines):
    """Sum the subscribers per feed over an iterable of access-log lines.

    Only the first matching line of each client address counts for a given
    feed; later lines from the same address for that feed are ignored.
    Returns a dict mapping feed path to subscriber total.
    """
    feed_data = {}
    unique = {}  # feed path -> set of client addresses already counted

    for logline in loglines:
        parsed = parse_feed_request(logline)
        if parsed is None:
            continue

        user, feed, newsubscribers = parsed
        feed_data.setdefault(feed, 0)
        seen = unique.setdefault(feed, set())
        if user not in seen:
            seen.add(user)
            feed_data[feed] += newsubscribers

    return feed_data


def popular_feeds(feed_data, minimum=MIN_SUBSCRIBERS):
    """Return ``(count, feed)`` pairs with at least ``minimum`` subscribers.

    The pairs are sorted by ascending count; feeds with equal counts keep
    the order in which they appear in ``feed_data``.
    """
    feedcounts = [(count, feed) for feed, count in feed_data.items()
                  if count >= minimum]
    return sorted(feedcounts, key=itemgetter(0))


def read_loglines(paths):
    """Yield every line of every log file in ``paths``, in order."""
    for path in paths:
        with open(path, 'r', encoding='utf-8') as logfile:
            for logline in logfile:
                yield logline


def main(arguments):
    """Report feed subscriber counts for the log files named in ``arguments``.

    ``arguments`` has the shape of ``sys.argv``: the program name followed
    by one or more log file paths. Exits with a message when no path is
    given or when a path is not readable.
    """
    if len(arguments) < 2:
        exit('Usage: {0} <nginx-log-files>'.format(arguments[0]))

    paths = arguments[1:]
    # Check every path up front so nothing is processed if one is unreadable.
    for path in paths:
        if not os.access(path, os.R_OK):
            exit('The log file at {0} is not readable.'.format(path))

    feed_data = count_subscribers(read_loglines(paths))
    for count, feed in popular_feeds(feed_data):
        print("{0} subscribers in {1}".format(count, feed))


if __name__ == '__main__':
    main(argv)