1.1. Pandas分析步骤
载入数据
将 access_time 按小时(HH)分组并进行 COUNT。类似如下SQL:
SELECT DATE_FORMAT(access_time, '%H'), count(*) FROM log GROUP BY DATE_FORMAT(access_time, '%H');
1.2. 代码
cat pd_ng_log_stat.py
#!/usr/bin/env python
#-*- coding: utf-8 -*-
from ng_line_parser import NgLineParser
import pandas as pd
import socket
import struct
class PDNgLogStat(object):
    """Load log files into a pandas DataFrame and compute page-view counts.

    Every line of every input file is handed to the ``NgLineParser`` instance
    created in ``__init__``; the dict it returns for that line becomes one row
    of ``self.df``.
    """

    def __init__(self):
        self.ng_line_parser = NgLineParser()
        # Filled in by load_data(); stays None until then.
        self.df = None

    def _log_line_iter(self, pathes):
        """Yield one parsed dict per line of every file listed in ``pathes``."""
        for path in pathes:
            with open(path, 'r') as f:
                for line in f:
                    self.ng_line_parser.parse(line)
                    yield self.ng_line_parser.to_dict()

    def load_data(self, path):
        """Build ``self.df`` from the log file(s) designated by ``path``.

        ``path`` may be an iterable of file paths or a single path string.
        """
        # A bare string would otherwise be iterated character by character,
        # with each character opened as if it were a file path.
        if isinstance(path, str):
            path = [path]
        self.df = pd.DataFrame(self._log_line_iter(path))

    @staticmethod
    def _hour_of(access_time):
        """Return the grouping key for one ``access_time`` string.

        The key is the text before the first ':' of the last
        whitespace-separated token.
        NOTE(review): presumably access_time looks like
        'YYYY-MM-DD HH:MM:SS', making the key the two-digit hour -- confirm
        against NgLineParser.
        """
        return access_time.split()[-1].split(':')[0]

    def pv_hour(self):
        """Count rows per hour-of-day key derived from ``access_time``.

        Returns the result of ``groupby(...).agg(['count'])`` restricted to
        the ``access_time`` column, indexed by the key from ``_hour_of``.

        Raises:
            AttributeError: if ``load_data`` has not been called yet.
        """
        if getattr(self, 'df', None) is None:
            raise AttributeError(
                'no data loaded: call load_data() before pv_hour()')
        group_by_cols = ['access_time']  # only this column is counted/shown
        # Group by a derived key (not by the raw column values), so rows from
        # different days that share the same hour key fall into one bucket.
        hour_key = self.df['access_time'].map(self._hour_of)
        pv_hour_grp = self.df[group_by_cols].groupby(hour_key)
        return pv_hour_grp.agg(['count'])
def main():
    """Load the sample access log and print the per-hour page-view counts."""
    file_pathes = ['www.ttmark.com.access.log']
    pd_ng_log_stat = PDNgLogStat()
    pd_ng_log_stat.load_data(file_pathes)
    # Per-hour page views. The parenthesised single-argument form of print
    # is valid both as the Python 2 statement and as the Python 3 function.
    print(pd_ng_log_stat.pv_hour())


if __name__ == '__main__':
    main()
运行统计和输出结果
python pd_ng_log_stat.py
access_time
count
access_time
00 31539
01 34824
02 27895
03 29669
04 27742
05 26797
06 29384
07 31102
08 38257
09 43060
10 48064
11 57923
12 56413
13 57971
14 47260
15 46364
16 45721
17 48884
18 49318
19 49162
20 43641
21 42525
22 40371
23 34953