rra
4 years ago
1 changed file with 82 additions and 0 deletions
@@ -0,0 +1,82 @@
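# Parse a year of gzipped Apache access logs in parallel and store one
# (timestamp, bytes-sent) pair per request in a SQLite database, for later
# traffic statistics.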
import apachelogs, gzip, os, sqlite3
import multiprocessing, time


parser = apachelogs.LogParser(apachelogs.COMBINED)

#entry = parser.parse('207.237.221.51 - - [22/Jan/2021:00:14:58 +0100] "GET /feeds/all.rss.xml HTTP/2.0" 304 0 "-" "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"')
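# apachelogs returns each parsed line as a LogEntry whose attributes mirror the
# combined log format; only entry.request_time (a datetime) and
# entry.bytes_sent (the %b field) are used below.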
def create_db():
    db_name = 'traffic_stats2.db'
    if not os.path.exists(db_name):
        conn = sqlite3.connect(db_name)
        c = conn.cursor()
        c.execute("""CREATE TABLE traffic
                     (datetimes timestamp,
                      bytes integer);""")
        conn.commit()
        c.close()
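# Note: SQLite's 'timestamp' declared type only round-trips back into datetime
# objects if the reading connection is opened with
# detect_types=sqlite3.PARSE_DECLTYPES; otherwise the column comes back as an
# ISO-format string.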
def populate_db(pairlist):
    # timeout is so we can avoid database locks
    # if multiple processes try to write to it shortly after one another
    conn = sqlite3.connect('traffic_stats2.db', timeout=30)
    c = conn.cursor()
    # trade durability for write speed; a crash mid-run can corrupt the database
    c.execute('''PRAGMA synchronous = OFF''')
    c.execute('''PRAGMA journal_mode = OFF''')
    sqliteparam = """INSERT INTO 'traffic'
                     ('datetimes', 'bytes') VALUES
                     (?,?);"""
    c.executemany(sqliteparam, pairlist)
    conn.commit()
    c.close()
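# Each worker hands over one batch per log file; executemany() runs the whole
# batch inside a single transaction ended by the commit() above, which is far
# faster than committing row by row.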
def parse_log(gzip_file):
    pairs = []
    try:
        with gzip.open(gzip_file, 'r') as logfile:
            log = logfile.readlines()
        for line in log:
            if line:
                line = line.decode('ascii').strip().strip('\x00') # hacky way to remove null bytes? https://stackoverflow.com/a/56904541
                try:
                    entry = parser.parse(str(line))
                    pairs.append((entry.request_time, entry.bytes_sent))
                except Exception as e:
                    # print(e)
                    # this catches malformed entries, but those are less than 0.1% of the total set
                    pass
        populate_db(pairs)
        print(gzip_file, '\n')
    except Exception as e:
        print(e, gzip_file)
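# Note on the pool below: each worker process needs its own copy of the
# module-level parser; with the default fork start method on Linux it is
# inherited, and with spawn it is re-created when the module is re-imported.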
log_dir = '/home/r/Programming/ltm-2020-stats/traffic/solar_logs'
files_to_process = []

for g in os.listdir(log_dir):
    # os.listdir() returns bare filenames, so join them back onto the
    # directory before checking or opening them
    path = os.path.join(log_dir, g)
    if os.path.isfile(path) and g.endswith('.gzip'):
        # skip files from 2019 and 2021
        if not g.startswith('2019') and not g.startswith('2021'):
            files_to_process.append(path)

files_to_process.sort()
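# Parallelism is per file: every worker parses a whole log and writes its own
# batch, so the processes share nothing except the database file itself.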
if __name__ == '__main__':
    starttime = time.time()
    create_db()

    pool = multiprocessing.Pool()
    pool.map(parse_log, files_to_process)
    pool.close()
    print()
    print('saving data')
    print('Time taken = {} seconds'.format(time.time() - starttime))
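# A quick sanity check on the result (hypothetical, run from a shell):
#   sqlite3 traffic_stats2.db "SELECT COUNT(*), SUM(bytes) FROM traffic;"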