我有一个标准 syslog 格式的日志文件。它看起来像这样,但每秒有数百行:
Jan 11 07:48:46 blahblahblah...
Jan 11 07:49:00 blahblahblah...
Jan 11 07:50:13 blahblahblah...
Jan 11 07:51:22 blahblahblah...
Jan 11 07:58:04 blahblahblah...
该日志文件并不一定正好在午夜轮转(rotate),但其中的内容永远不会超过两天。
我经常需要从这个文件中提取时间片。我想为此编写一个通用脚本,可以像这样调用:
$ timegrep 22:30-02:00 /logs/something.log
...并让它提取从 22:30 开始、跨越午夜、直到第二天凌晨 2 点的所有日志行。
有几点需要注意:
- 我不想费心在命令行上输入日期,只需输入时间。程序应该足够聪明,能够找出答案。
- 日志日期格式不包括年份,因此它应该根据当前年份进行猜测,但仍然在元旦前后做正确的事情。
- 我希望它速度快——它应该利用文件中各行按时间顺序排列这一事实,在文件中用 seek 跳转并使用二分查找(binary search)。
在我花大量时间编写这个脚本之前,想先问一下:这样的工具是否已经存在?
答案1
更新:我用更新版本替换了原始代码,并进行了大量改进。我们称之为(实际?)alpha 质量。
此版本包括:
- 命令行选项处理
- 命令行日期格式验证
- 一些 try 块
- 行读取移至函数中
原文:
你知道吗?“寻找(seek)”,你就会找到!这是一个 Python 程序,它在文件中用 seek 四处跳转,并使用大致上的二分查找。它比另一位回答者写的 AWK 脚本快得多。
它的质量是(预)alpha 版。它应该有try
块和输入验证以及大量测试,并且毫无疑问会更加 Pythonic。但这里是为了娱乐。哦,它是为 Python 2.6 编写的。
新代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# timegrep.py by Dennis Williamson 20100113
# in response to http://serverfault.com/questions/101744/fast-extraction-of-a-time-range-from-syslog-logfile
# thanks to serverfault user http://serverfault.com/users/1545/mike
# for the inspiration
# Perform a binary search through a log file to find a range of times
# and print the corresponding lines
# tested with Python 2.6
# TODO: Make sure that it works if the seek falls in the middle of
# the first or last line
# TODO: Make sure it's not blind to a line where the sync read falls
# exactly at the beginning of the line being searched for and
# then gets skipped by the second read
# TODO: accept arbitrary date
# done: add -l long and -s short options
# done: test time format
version = "0.01a"
import os, sys
from stat import *
from datetime import date, datetime
import re
from optparse import OptionParser
# Function to read lines from file and extract the date and time
def getdata():
    """Read one line from the open log file and extract its timestamp.

    Uses two module-level names set up by the main script: ``handle``
    (the open log file) and ``bufsize`` (the maximum number of characters
    read for a single line).

    Return a tuple containing:
        the date/time built from the first three whitespace-separated
        fields, joined by single spaces, in a format such as
        'Jan 15 20:14:01' (a one-digit day is zero-padded so that
        'Jan  5 ...' becomes 'Jan 05 ...', which is the form produced by
        strftime("%b %d")); '' if the line has fewer than three fields
        the line itself, without its trailing newline
    The last colon and seconds are optional and
    not handled specially.

    Prints a message and exits with status 1 on a read error, at end of
    file, when a line is longer than ``bufsize`` or when the final line
    has no terminating newline.
    """
    try:
        line = handle.readline(bufsize)
    except (IOError, OSError, ValueError):
        # ValueError covers reads on a closed file and decoding failures.
        print("File I/O Error")
        sys.exit(1)

    if line == '':
        print("EOF reached")
        sys.exit(1)

    if not line.endswith('\n'):
        # Either readline() stopped at the size limit or the file ends
        # with an unterminated line; neither can be compared reliably.
        if len(line) >= bufsize:
            print("Line length exceeds buffer size")
        else:
            print("Missing newline")
        sys.exit(1)
    line = line.rstrip('\n')

    # split() without an argument collapses runs of whitespace, so the
    # space-padded day that syslog writes for the 1st..9th ("Jan  5") does
    # not produce an empty field and push the time out of the timestamp.
    fields = line.split()
    if len(fields) < 3:
        return ('', line)

    month, day, clock = fields[:3]
    if day.isdigit():
        day = day.zfill(2)
    return (month + " " + day + " " + clock, line)
# End function getdata()
# Main script: parse the command line, work out which calendar days the
# requested times belong to, locate the start of the range with a binary
# search and print every line up to the end of the range.
#
# NOTE: timestamps are compared as plain strings of the form
# 'Mon DD hh:mm[:ss]'.  That is only chronological while both ends of the
# range fall in the same month (month abbreviations do not sort in
# calendar order).

# Set up option handling
parser = OptionParser(version="%prog " + version)
parser.usage = ("\n\t%prog [options] start-time end-time filename\n\n"
                "\twhere times are in the form hh:mm[:ss]")
parser.description = (
    "Search a log file for a range of times occurring yesterday "
    "and/or today using the current time to intelligently select the start and end. "
    "A date may be specified instead. Seconds are optional in time arguments.")

parser.add_option("-d", "--date", action="store", dest="date",
                  default="",
                  help="NOT YET IMPLEMENTED. Use the supplied date instead of today.")
parser.add_option("-l", "--long", action="store_true", dest="longout",
                  default=False,
                  help="Span the longest possible time range.")
parser.add_option("-s", "--short", action="store_true", dest="shortout",
                  default=False,
                  help="Span the shortest possible time range.")
parser.add_option("-D", "--debug", action="store", dest="debug",
                  default=0, type="int",
                  help="Output debugging information.\t\t\t\t\tNone (default) = %default, Some = 1, More = 2")

(options, args) = parser.parse_args()

# parser.error() prints the message and exits, so no else branches are needed.
if not 0 <= options.debug <= 2:
    parser.error("debug level out of range")
debug = options.debug  # 1 = print some debug output, 2 = print a little more, 0 = none

if options.longout and options.shortout:
    parser.error("options -l and -s are mutually exclusive")
if options.date:
    parser.error("date option not yet implemented")
if len(args) != 3:
    parser.error("invalid number of arguments")

start, end, logfile = args

# test for times to be properly formatted, allow hh:mm or hh:mm:ss
timespec = re.compile(r'^(2[0-3]|[01][0-9]):[0-5][0-9](:[0-5][0-9])?$')
if not (timespec.match(start) and timespec.match(end)):
    print("Invalid time specification")
    sys.exit(1)

# Determine Time Range
# Take a single clock reading so that "today", "yesterday" and "now" stay
# consistent even if the script runs across midnight.
current = datetime.now()
today = current.strftime("%b %d")
yesterday = date.fromordinal(current.toordinal() - 1).strftime("%b %d")
now = current.strftime("%H:%M")  # same output as %R, but portable

# A start time later than "now" (or later than the end time) must have
# happened yesterday; -l and -s both force the range to begin yesterday.
if start > now or start > end or options.longout or options.shortout:
    searchstart = yesterday
else:
    searchstart = today

# The end is yesterday only when the whole range lies after "now"
# (unless -l is given), or when -s forces it.
if (end > start > now and not options.longout) or options.shortout:
    searchend = yesterday
else:
    searchend = today

searchstart = searchstart + " " + start
searchend = searchend + " " + end

try:
    handle = open(logfile, 'r')
except (IOError, OSError):
    print("File Open Error")
    sys.exit(1)

try:
    # Set some initial values
    bufsize = 4096  # handle long lines, but put a limit on them
    rewind = 100    # arbitrary, the optimal value is highly dependent on the structure of the file
    limit = 75      # arbitrary, allow for a VERY large file, but stop it if it runs away
    count = 0
    size = os.stat(logfile).st_size
    beginrange = 0
    midrange = size // 2  # integer offset; plain "/" yields a float on Python 3
    oldmidrange = midrange
    endrange = size
    linedate = ''
    line = ''  # defined up front so the debug output works on an empty file
    pos1 = pos2 = 0

    if debug > 0:
        print("File: '{0}' Size: {1} Today: '{2}' Now: {3} Start: '{4}' End: '{5}'".format(
            logfile, size, today, now, searchstart, searchend))

    # Seek using binary search
    while pos1 != endrange and oldmidrange != 0 and linedate != searchstart:
        handle.seek(midrange)
        linedate, line = getdata()  # sync to line ending
        pos1 = handle.tell()
        if midrange > 0:  # if not BOF, discard first read
            if debug > 1:
                print("...partial: (len: {0}) '{1}'".format((len(line)), line))
            linedate, line = getdata()
        pos2 = handle.tell()
        count += 1
        if debug > 0:
            print("#{0} Beg: {1} Mid: {2} End: {3} P1: {4} P2: {5} Timestamp: '{6}'".format(
                count, beginrange, midrange, endrange, pos1, pos2, linedate))
        if searchstart > linedate:
            beginrange = midrange
        else:
            endrange = midrange
        oldmidrange = midrange
        midrange = (beginrange + endrange) // 2
        if count > limit:
            print("ERROR: ITERATION LIMIT EXCEEDED")
            sys.exit(1)

    if debug > 0:
        print("...stopping: '{0}'".format(line))

    # Rewind a bit to make sure we didn't miss any
    seek = oldmidrange
    while linedate >= searchstart and seek > 0:
        seek = max(seek - rewind, 0)
        if debug > 0:
            print("...rewinding")
        handle.seek(seek)
        linedate, line = getdata()
        if seek > 0:
            # Landed somewhere inside a line: that read only synced to the
            # next line ending.  At offset 0 the read is a complete first
            # line and must be kept, otherwise a matching first line of
            # the file would never be printed.
            if debug > 1:
                print("...junk: '{0}'".format(line))
            linedate, line = getdata()
        if debug > 0:
            print("...comparing: '{0}'".format(linedate))

    # Scan forward
    while linedate < searchstart:
        if debug > 0:
            print("...skipping: '{0}'".format(linedate))
        linedate, line = getdata()

    if debug > 0:
        print("...found: '{0}'".format(line))
        print("Beg: {0} Mid: {1} End: {2} P1: {3} P2: {4} Timestamp: '{5}'".format(
            beginrange, midrange, endrange, pos1, pos2, linedate))

    # Now that the preliminaries are out of the way, we just loop,
    # reading lines and printing them until they are
    # beyond the end of the range we want
    while linedate <= searchend:
        print(line)
        linedate, line = getdata()

    if debug > 0:
        print("Start: '{0}' End: '{1}'".format(searchstart, searchend))
finally:
    # getdata() leaves via sys.exit() at end of file; SystemExit still
    # passes through here, so the file is closed on every path.
    handle.close()
答案2
这将根据条目与当前时间(“现在”)的关系打印开始时间和结束时间之间的条目范围。
用法:
timegrep [-l] start end filename
例子:
$ timegrep 18:47 03:22 /some/log/file
(long)选项-l
可产生最长的输出。如果开始时间的小时和分钟值小于结束时间和现在,则开始时间将被解释为昨天。如果开始时间和结束时间的 HH:MM 值都大于“现在”,则结束时间将被解释为今天。
假设“现在”是“1 月 11 日 19:00”,则各种示例开始和结束时间的解释方式如下(除非-l
另有说明):
开始    结束    范围开始    范围结束
19:01   23:59   1 月 10 日   1 月 10 日
19:01   00:00   1 月 10 日   1 月 11 日
00:00   18:59   1 月 11 日   1 月 11 日
18:59   18:58   1 月 10 日   1 月 10 日
19:01   23:59   1 月 10 日   1 月 11 日   # -l
00:00   18:59   1 月 10 日   1 月 11 日   # -l
18:59   19:01   1 月 10 日   1 月 11 日   # -l
脚本的绝大部分都是准备工作,最后两行才完成所有实际工作。
警告:未进行参数验证或错误检查。尚未彻底测试边缘情况。这是针对 gawk 编写的,其他版本的 AWK 可能会出错。
#!/usr/bin/awk -f
# timegrep (gawk): print the log lines whose timestamp lies between a
# start and an end time given as hh:mm[:ss].
#
# Usage: timegrep [-l] start end filename
#
# The days the two times belong to (yesterday or today) are derived from
# the current time; -l selects the longest possible range.  Needs gawk for
# strftime() and mktime().
#
# NOTE: timestamps are compared as strings of the form "Mon DD hh:mm:ss",
# so the comparison is only chronological within a single month.

# Build the timestamp of the current record.  The day is zero-padded so a
# syslog day written as " 5" compares correctly against the "%b %d" form
# ("Jan 05") used for startdate and enddate.
function stamp() {
    return sprintf("%s %02d %s", $1, $2, $3)
}

BEGIN {
    # Consume the option and the two time arguments, blanking each ARGV
    # slot so awk does not treat it as an input file.
    arg = 1
    if (ARGV[arg] == "-l") {
        longout = 1
        ARGV[arg++] = ""
    }
    start = ARGV[arg]
    ARGV[arg++] = ""
    end = ARGV[arg]
    ARGV[arg++] = ""

    # An hour of -24 makes mktime() normalise the date to yesterday.
    yesterday = strftime("%b %d", mktime(strftime("%Y %m %d -24 00 00")))
    today = strftime("%b %d")
    now = strftime("%R")

    # A start later than "now" (or later than the end) happened yesterday.
    if (start > now || start > end || longout)
        startdate = yesterday
    else
        startdate = today

    # The end is yesterday only when the whole range lies after "now".
    if (end > now && end > start && start > now && !longout)
        enddate = yesterday
    else
        enddate = today

    startdate = startdate " " start
    enddate = enddate " " end
}

# Stop at the first line past the range; print everything from its start.
stamp() > enddate { exit }
stamp() >= startdate { print }
我认为 AWK 在搜索文件方面非常高效。对于未建立索引的文本文件,我认为没有什么工具必然会比它更快。
答案3
通过在网上快速搜索,可以发现有些内容可以根据关键词进行提取(比如 FIRE 或类似的:),但是没有可以从文件中提取日期范围的内容。
按照你的建议去做似乎并不难:
- 搜索开始时间。
- 打印出该行。
- 如果结束时间 < 开始时间,并且一行的日期 > 结束时间且 < 开始时间,则停止。
- 如果结束时间 > 开始时间,且该行的日期 > 结束时间,则停止。
看起来很简单,如果你不介意的话,我可以为你用 Ruby 编写:)
答案4
尽管这个答案已经太晚了,但对某些人来说还是有益的。
我已将@Dennis Williamson 的代码转换为可以用于其他 Python 内容的 Python 类。
我还添加了对多种日期格式的支持。
import os
from stat import *
from datetime import date, datetime
import re
# @TODO Support for rotated log files - currently using the current year for 'Jan 01' dates.
class LogFileTimeParser(object):
    """
    Extracts parts of a log file based on a start and enddate
    Uses binary search logic to speed up searching

    Common usage: validate log files during testing
    Faster than awk parsing for big log files

    The log file must be sorted by timestamp.  Each line's timestamp is
    taken from its first three whitespace-separated fields.
    """
    version = "0.01a"

    # Set some initial values
    BUF_SIZE = 4096  # handle long lines, but put a limit to them
    REWIND = 100  # arbitrary, the optimal value is highly dependent on the structure of the file
    LIMIT = 75  # arbitrary, allow for a VERY large file, but stop it if it runs away

    # State of the most recent read_lines() call and the file being read.
    line_date = ''
    line = None
    opened_file = None

    # Formats tried, in this order, by parse_date().
    _DATE_FORMATS = (
        '%Y-%m-%d %H:%M:%S %f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',
        '%b %d %H:%M:%S %f', '%b %d %H:%M', '%b %d %H:%M:%S',
        '%b %d %Y %H:%M:%S %f', '%b %d %Y %H:%M', '%b %d %Y %H:%M:%S',
        '%b %d %Y %I:%M:%S%p', '%b %d %Y %I:%M%p', '%b %d %Y %I:%M:%S%p %f')
    # Formats that carry no year; the current year is substituted for them.
    _YEARLESS_FORMATS = ('%b %d %H:%M:%S %f', '%b %d %H:%M', '%b %d %H:%M:%S')
    # A bare time of day: hh:mm or hh:mm:ss
    _TIME_ONLY = re.compile(r'^(2[0-3]|[01][0-9]):[0-5][0-9](:[0-5][0-9])?$')

    @staticmethod
    def parse_date(text, validate=True):
        """
        Convert a timestamp string to a datetime.

        Supports Aug 16 14:59:01 , 2016-08-16 09:23:09 and
        Jun 1 2005 1:33:06PM (with or without seconds, milliseconds).
        Timestamps without a year are assumed to be in the current year.

        When no format matches, raises ValueError if ``validate`` is true,
        otherwise returns datetime.min (None cannot be compared with
        datetimes, so the minimum is used as the "no date" marker).
        """
        for fmt in LogFileTimeParser._DATE_FORMATS:
            try:
                parsed = datetime.strptime(text, fmt)
                if fmt in LogFileTimeParser._YEARLESS_FORMATS:
                    parsed = parsed.replace(year=datetime.now().year)
                return parsed
            except ValueError:
                continue
        if validate:
            raise ValueError("No valid date format found for '{0}'".format(text))
        return datetime.min

    # Function to read lines from file and extract the date and time
    def read_lines(self):
        """
        Read a line from the opened file.

        Return a tuple containing:
            the timestamp of the line as a datetime (datetime.min when the
            line does not start with a timestamp parse_date understands)
            the line itself, without the trailing newline

        Raises EOFError at end of file, IOError when the read fails and
        ValueError for a line longer than BUF_SIZE or a final line without
        a newline.
        """
        try:
            self.line = self.opened_file.readline(self.BUF_SIZE)
        except Exception:
            raise IOError("File I/O Error")
        if self.line == '':
            raise EOFError("EOF reached")
        # Remove \n from read lines.
        if not self.line.endswith('\n'):
            if len(self.line) >= self.BUF_SIZE:
                raise ValueError("Line length exceeds buffer size")
            raise ValueError("Missing newline")
        self.line = self.line.rstrip('\n')
        # split() without an argument collapses runs of spaces, so the
        # space-padded day syslog writes for the 1st..9th ("Jan  5") still
        # gives month, day and time as the first three fields.
        words = self.line.split()
        stamp = ' '.join(words[:3]) if len(words) >= 3 else ''
        self.line_date = self.parse_date(stamp, False)
        return self.line_date, self.line

    def _resolve_range(self, start, end):
        """Turn the start/end arguments into a pair of datetimes."""
        # Both arguments must be bare times before a day is guessed for
        # them; a mix of a time and a full date cannot be combined.
        if self._TIME_ONLY.match(start) and self._TIME_ONLY.match(end):
            # One clock reading keeps today/yesterday/now consistent.
            current = datetime.now()
            today = current.strftime("%Y-%m-%d")
            yesterday = date.fromordinal(current.toordinal() - 1).strftime("%Y-%m-%d")
            now = current.strftime("%H:%M")  # same output as %R, but portable
            # A start later than "now" (or than the end) happened yesterday.
            start_day = yesterday if (start > now or start > end) else today
            # The end is yesterday only if the whole range lies after "now".
            end_day = yesterday if end > start > now else today
            return (self.parse_date(start_day + " " + start),
                    self.parse_date(end_day + " " + end))
        return self.parse_date(start), self.parse_date(end)

    def get_lines_between_timestamps(self, start, end, path_to_file, debug=False):
        """
        Print every line of ``path_to_file`` whose timestamp lies between
        ``start`` and ``end`` (inclusive).

        ``start`` and ``end`` are either both bare times (hh:mm[:ss]), in
        which case the day is chosen relative to the current time, or full
        timestamps in any format parse_date accepts.  Lines without a
        parsable timestamp are not printed.  ``debug`` enables progress
        output.

        Raises ValueError for an unparsable start/end, IOError when the
        file cannot be opened and IndexError when the binary search exceeds
        LIMIT iterations.  Reaching end of file ends the output silently.
        """
        # Set some initial values
        count = 0
        size = os.stat(path_to_file).st_size
        begin_range = 0
        mid_range = size // 2  # integer offset; "/" yields a float on Python 3
        old_mid_range = mid_range
        end_range = size
        pos1 = pos2 = 0

        search_start, search_end = self._resolve_range(start, end)

        # Forget whatever an earlier call left behind; datetime.min keeps
        # every comparison below datetime-vs-datetime.
        self.line_date = datetime.min
        self.line = None

        try:
            self.opened_file = open(path_to_file, 'r')
        except (IOError, OSError):
            raise IOError("File Open Error")

        if debug:
            print("File: '{0}' Size: {1} Start: '{2}' End: '{3}'"
                  .format(path_to_file, size, search_start, search_end))

        # Seek using binary search -- ONLY WORKS ON FILES WHO ARE SORTED BY DATES (should be true for log files)
        try:
            while pos1 != end_range and old_mid_range != 0 and self.line_date != search_start:
                self.opened_file.seek(mid_range)
                # sync to line ending
                self.read_lines()
                pos1 = self.opened_file.tell()
                # if not beginning of file, discard first read
                if mid_range > 0:
                    if debug:
                        print("...partial: (len: {0}) '{1}'".format((len(self.line)), self.line))
                    self.read_lines()
                pos2 = self.opened_file.tell()
                count += 1
                if debug:
                    print("#{0} Beginning: {1} Mid: {2} End: {3} P1: {4} P2: {5} Timestamp: '{6}'".
                          format(count, begin_range, mid_range, end_range, pos1, pos2, self.line_date))
                if search_start > self.line_date:
                    begin_range = mid_range
                else:
                    end_range = mid_range
                old_mid_range = mid_range
                mid_range = (begin_range + end_range) // 2
                if count > self.LIMIT:
                    raise IndexError("ERROR: ITERATION LIMIT EXCEEDED")

            if debug:
                print("...stopping: '{0}'".format(self.line))

            # Rewind a bit to make sure we didn't miss any
            seek = old_mid_range
            while self.line_date >= search_start and seek > 0:
                seek = max(seek - self.REWIND, 0)
                if debug:
                    print("...rewinding")
                self.opened_file.seek(seek)
                self.read_lines()
                if seek > 0:
                    # Landed inside a line: that read only synced to the
                    # next line ending.  At offset 0 the read is a complete
                    # first line and must be kept, otherwise a matching
                    # first line of the file would never be printed.
                    if debug:
                        print("...junk: '{0}'".format(self.line))
                    self.read_lines()
                if debug:
                    print("...comparing: '{0}'".format(self.line_date))

            # Scan forward
            while self.line_date < search_start:
                if debug:
                    print("...skipping: '{0}'".format(self.line_date))
                self.read_lines()

            if debug:
                print("...found: '{0}'".format(self.line))
                print("Beginning: {0} Mid: {1} End: {2} P1: {3} P2: {4} Timestamp: '{5}'".
                      format(begin_range, mid_range, end_range, pos1, pos2, self.line_date))

            # Now that the preliminaries are out of the way, we just loop,
            # reading lines and printing them until they are beyond the end of the range we want
            while self.line_date <= search_end:
                # Exclude lines without a parsable timestamp
                if self.line_date != datetime.min:
                    print(self.line)
                self.read_lines()

            if debug:
                print("Start: '{0}' End: '{1}'".format(search_start, search_end))
        except EOFError:
            # Running off the end of the file is the normal way to finish
            # when the range extends to the last line.
            pass
        finally:
            self.opened_file.close()