I have customized Apache dispatcher access logs that I need to parse in Spark using Scala. I have written the sample code below, but I am not able to retrieve the matched groups by name. Some of the fields in my logs, such as cookies, are optional, so I want a stable way to fetch each captured group by its name rather than by its sequence number. My code is shown first, followed by sample logs:
import scala.util.matching.Regex
/**
 * One parsed access-log entry.
 *
 * Every field is stored as a plain string; no numeric or date conversion is
 * applied. `Logs.parseLogLine` fills all fields with "-1" when a line does not
 * match the pattern and with "-2" when parsing throws.
 *
 * @param date       value of the `date` capture group
 * @param time       value of the `time` capture group
 * @param host       value of the `host` capture group
 * @param xForward   value of the `xForward` capture group
 * @param user       value of the `user` capture group
 * @param method     value of the `method` capture group
 * @param statusCode value of the `statusCode` capture group
 * @param resTime    value of the `resTime` capture group
 * @param resBytes   value of the `resBytes` capture group
 * @param referer    value of the `referer` capture group
 * @param userAgent  value of the `userAgent` capture group
 * @param cookies    value of the `cookies` capture group
 * @param cookies2   value of the `cookies2` capture group
 * @param request    value of the `request` capture group
 * @param protocol   value of the `protocol` capture group
 * @param path       value of the `path` capture group
 */
case class LogRecord(
  date: String,
  time: String,
  host: String,
  xForward: String,
  user: String,
  method: String,
  statusCode: String,
  resTime: String,
  resBytes: String,
  referer: String,
  userAgent: String,
  cookies: String,
  cookies2: String,
  request: String,
  protocol: String,
  path: String
)
/**
 * Parser for customized Apache dispatcher access-log lines.
 *
 * Capture groups are read by name from the underlying `java.util.regex.Matcher`
 * instead of `scala.util.matching.Regex.Match`. On older Scala versions
 * (before 2.13) `Regex.Match.group(name)` only resolves names that were passed
 * to the `Regex` constructor, so inline `(?<name>...)` groups raised
 * `NoSuchElementException`. The Java matcher resolves inline names on every
 * Scala version.
 */
object Logs extends Serializable {
  // Local import so the file's top-level import block does not need to change.
  import scala.util.control.NonFatal

  /** Placeholder for a named group that did not take part in a match. */
  private val Missing = "-"

  /**
   * Pattern describing one log line, with an inline named group per field.
   *
   * NOTE(review): the expression is kept exactly as it was. Several value
   * groups are a greedy `\S*` followed by an optional quote, so a captured
   * value may still carry its closing quote. Verify against real log lines.
   * The `cookies3` group is captured but not copied into `LogRecord`.
   */
  val PATTERN: Regex = """(date)*=?\"*(?<date>(\S*))\"*\s*(time)*=?\"*(?<time>(\S*))\"*\s*(host1)*=?\"*(?<host>(\S*))\"*\s*x-forward=\"*(?<xForward>(?:\d+\.){3}\d+(?:,\s*(?:\d+\.){3}\d+)*|-)\"*\s*(user)*=?\"*(?<user>([^ ]*))\"*\s*(method)*=?\"*(?<method>(\S*))\"*\s*(status_code)*=?\"*(?<statusCode>(\S*))\"*\s*(res_time)*=?\"*(?<resTime>(\S*))\"*\s*(res_bytes)*=?\"*(?<resBytes>(\S*))\"*\s*(ref)*=?\"*(?<referer>[^\"]*)\"*\s*(ua)*=?\"*(?<userAgent>[^\"]*)\"*\s*\"*(?<cookies>[^\"]*)\"*\s*\"*(?<cookies2>[^\"]*)\"*\s*\"*(?<cookies3>[^\"]*)\"*\s*\"*(?<request>[^\"]*)\"*\s(proto)*=?\"*(?<protocol>(\S*))\"*\s*(path)*=?\"*(?<path>(\S*))\"*\s*""".r

  /**
   * Parses a single log line into a [[LogRecord]].
   *
   * @param log the raw log line
   * @return the parsed record; a record whose fields are all "-1" when the
   *         pattern finds no match in `log`; a record whose fields are all
   *         "-2" when parsing throws a non-fatal exception
   */
  def parseLogLine(log: String): LogRecord =
    try {
      // Same search semantics as Regex.findFirstMatchIn: first match anywhere in the line.
      val matcher = PATTERN.pattern.matcher(log)
      if (matcher.find()) {
        // Matcher.group(name) returns null for a group that did not participate.
        def field(name: String): String =
          Option(matcher.group(name)).getOrElse(Missing)

        LogRecord(
          field("date"),
          field("time"),
          field("host"),
          field("xForward"),
          field("user"),
          field("method"),
          field("statusCode"),
          field("resTime"),
          field("resBytes"),
          field("referer"),
          field("userAgent"),
          field("cookies"),
          field("cookies2"),
          field("request"),
          field("protocol"),
          field("path")
        )
      } else {
        println("Rejected Log Line: " + log)
        sentinel("-1")
      }
    } catch {
      case NonFatal(e) =>
        // Report the cause instead of hiding it; a silent catch here is what
        // masked the failing named-group lookup.
        println("Failed to parse log line: " + log + " (" + e + ")")
        sentinel("-2")
    }

  /** Builds a record whose sixteen fields all hold `marker`. */
  private def sentinel(marker: String): LogRecord =
    LogRecord(
      marker, marker, marker, marker,
      marker, marker, marker, marker,
      marker, marker, marker, marker,
      marker, marker, marker, marker
    )
}
-----------------START----------------------
date="12-16-2016" time="02:00:02.1481871602" host1="192.96.2.120" x-forward="192.267.186.196" user="-" method="POST" status_code="301" res_time="1108" res_bytes="266" ref="-" ua="Microsoft Office/16.0 (Windows NT 6.1; Microsoft Outlook 16.0.7426; Pro)" "OutlookSession=\"{3975F011-1D66-4540-BF7A-9FF0EDD5B572}\"; AKCounty=; AKAreacode=; AKCity=HONGKONG; AKRegioncode=; AKZip=; AKCountry=HK; BIGipServertestserver.test.com_443=838938378.36895.0000; BIGipServertestserver.test.com=570502410.36895.0000; TLTUID=76D578D8C35C10C39C7CBC55A6B7903E; TLTSID=76D578D8C35C10C39C7CBC55A6B7903E" "POST /autodiscover/autodiscover.xml HTTP/1.1" proto=HTTP/1.1 path=/autodiscover/autodiscover.xml
date="12-16-2016" time="02:00:02.1481871602" host1="192.96.2.120" x-forward="-" user="-" method="GET" status_code="200" res_time="39449" res_bytes="62444" ref="-" ua="Mozilla/4.0 (compatible; MSIE 4.01; Windows NT)" "-" "GET /us/en/home.html HTTP/1.1" proto=HTTP/1.1 path=/content/base/us/en/home.html
date="12-16-2016" time="02:00:03.1481871603" host1="192.96.2.120" x-forward="192.267.186.196" user="-" method="POST" status_code="301" res_time="694" res_bytes="266" ref="-" ua="Microsoft Office/15.0 (Windows NT 6.1; Microsoft Outlook 15.0.4875; Pro)" "OutlookSession=\"{1CD60224-8A1F-4382-A7C9-2002BBB6E11B}\"" "POST /autodiscover/autodiscover.xml HTTP/1.1" proto=HTTP/1.1 path=/autodiscover/autodiscover.xml
----------------- END ----------------------