-- web access log schema -- anders-panacea@evantide.com -- -- ab: may/02/2002 - creation -- -- sample log line: -- 4.33.251.90 - - [02/May/2002:16:08:43 -0400] "GET / HTTP/1.1" 200 45165 "http://www.google.com/search?hl=en&q=san+mateo+t1+isp" "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)" -- everything centers around the requests table create table requests ( id serial, ipId int not null, usernameId int not null, timestamp datetime, methodId int not null, pathId int not null, versionId int not null, statusId int not null, size int not null, refererId int, userAgentId int, siteId int not null, serverId int not null, logSourceId int not null ); -- basic breakout tables create table ips ( id serial, ip inet not null, name varchar( 128 ) ); create unique index ipsIndex on ips ( ip ); create table usernames ( id serial, username varchar( 32 ) not null ); create unique index usernamesIndex on usernames ( username ); -- these are only basic http authenticated users. (ala htaccess/htpasswd) create table methods ( id serial, method varchar( 8 ) not null ); create unique index methodsIndex on methods ( method ); insert into methods ( method ) values ( 'OPTIONS' ); insert into methods ( method ) values ( 'GET' ); insert into methods ( method ) values ( 'HEAD' ); insert into methods ( method ) values ( 'POST' ); insert into methods ( method ) values ( 'PUT' ); insert into methods ( method ) values ( 'DELETE' ); insert into methods ( method ) values ( 'TRACE' ); insert into methods ( method ) values ( 'CONNECT' ); -- from the rfc at http://www.ietf.org/rfc/rfc2616.txt create table paths ( id serial, path varchar( 256 ) not null ); create unique index pathsIndex on paths ( path ); -- example: /images/bandwidth_logo_sm.gif -- note: doesn't include arguments (those are stored in keyValuePairs table) create table versions ( id serial, version varchar( 32 ) not null ); create unique index versionsIndex on versions ( version ); insert into versions ( version ) values ( 'HTTP/1.0' ); insert into versions ( version ) values ( 'HTTP/1.1' ); insert into versions ( version ) values ( 'HTTP/1.2' ); insert into versions ( version ) values ( 'HTTP/1.3' ); -- more will come so will need to add more in the future create table statuses ( id serial, status int not null, description varchar( 32 ) not null ); create unique index statusIndex on statuses ( status ); insert into statuses ( status, description ) values ( 100, 'Continue' ); insert into statuses ( status, description ) values ( 101, 'Switching Protocols' ); insert into statuses ( status, description ) values ( 200, 'OK' ); insert into statuses ( status, description ) values ( 201, 'Created' ); insert into statuses ( status, description ) values ( 202, 'Accepted' ); insert into statuses ( status, description ) values ( 203, 'Non-Authoritative Information' ); insert into statuses ( status, description ) values ( 204, 'No Content' ); insert into statuses ( status, description ) values ( 205, 'Reset Content' ); insert into statuses ( status, description ) values ( 206, 'Partial Content' ); insert into statuses ( status, description ) values ( 300, 'Multiple Choices' ); insert into statuses ( status, description ) values ( 301, 'Moved Permanently' ); insert into statuses ( status, description ) values ( 302, 'Found' ); insert into statuses ( status, description ) values ( 303, 'See Other' ); insert into statuses ( status, description ) values ( 304, 'Not Modified' ); insert into statuses ( status, description ) values ( 305, 'Use Proxy' ); insert into statuses ( status, description ) values ( 307, 'Temporary Redirect' ); insert into statuses ( status, description ) values ( 400, 'Bad Request' ); insert into statuses ( status, description ) values ( 401, 'Unauthorized' ); insert into statuses ( status, description ) values ( 402, 'Payment Required' ); insert into statuses ( status, description ) values ( 403, 'Forbidden' ); insert into statuses ( status, description ) values ( 404, 'Not Found' ); insert into statuses ( status, description ) values ( 405, 'Method Not Allowed' ); insert into statuses ( status, description ) values ( 406, 'Not Acceptable' ); insert into statuses ( status, description ) values ( 407, 'Proxy Authentication Required' ); insert into statuses ( status, description ) values ( 408, 'Request Time-out' ); insert into statuses ( status, description ) values ( 409, 'Conflict' ); insert into statuses ( status, description ) values ( 410, 'Gone' ); insert into statuses ( status, description ) values ( 411, 'Length Required' ); insert into statuses ( status, description ) values ( 412, 'Precondition Failed' ); insert into statuses ( status, description ) values ( 413, 'Request Entity Too Large' ); insert into statuses ( status, description ) values ( 414, 'Request-URI Too Large' ); insert into statuses ( status, description ) values ( 415, 'Unsupported Media Type' ); insert into statuses ( status, description ) values ( 416, 'Requested range not satisfiable' ); insert into statuses ( status, description ) values ( 417, 'Expectation Failed' ); insert into statuses ( status, description ) values ( 500, 'Internal Server Error' ); insert into statuses ( status, description ) values ( 501, 'Not Implemented' ); insert into statuses ( status, description ) values ( 502, 'Bad Gateway' ); insert into statuses ( status, description ) values ( 503, 'Service Unavailable' ); insert into statuses ( status, description ) values ( 504, 'Gateway Time-out' ); insert into statuses ( status, description ) values ( 505, 'HTTP Version not supported' ); -- why don't i use the http response code as the unique id? because i'm -- a slave to conformity. the result code is a unique int, but then the -- implimentation is non-standard and hence a cause for confusion. create table referers ( id serial, referer varchar( 512 ) not null ); create unique index referersIndex on referers ( referer ); -- example: http://google.yahoo.com/bin/query?p=internet+Bandwidth+t1&hc=0&hs=5 create table userAgents ( id serial, userAgent varchar( 256 ) not null ); create unique index userAgentsIndex on userAgents ( userAgent ); -- example: 'Mozilla/4.0 (compatible; MSIE 6.0; Windows 98; Q312461)' create table sites ( id serial, url varchar( 64 ) not null, description varchar( 256 ) ); create unique index sitesIndex on sites ( url ); -- a website or virtual host such as 'http://www.bandwidth.com/' create table servers ( id serial, name varchar( 64 ) not null, description varchar( 256 ) ); create unique index serversIndex on servers ( name ); -- computer that served the request such as 'web01.bandwidth.com' create table logSources ( id serial, name varchar( 256 ) not null, logSourceTypeId int not null, timestamp datetime not null, startDate datetime, endDate datetime ); create unique index logSourcesIndex on logSources ( name ); -- log sources tie the request to a specific source such as a log file -- or a live session in the case of realtime logging. in the case of -- a live session, the name will usually be a human readable string -- identifying the session. in the case of a log file, the name of the -- file will be recorded as a "not all that foolproof" means of -- stopping multiple imports of the same log file. the root reason for -- this table is a means for identifying requests to be ripped out in -- the case where the data is inserted by a double log file import or -- rogue live session. create table logSourceTypes ( id serial, name varchar( 64 ) not null, description varchar( 256 ) ); create unique index logSourceTypeIndex on logSourceTypes ( name ); insert into logSourceTypes ( name, description ) values ( 'File', 'An access log file from a filesystem.' ); insert into logSourceTypes ( name, description ) values ( 'Web Application Feed', 'Data collected on the fly from a webserver.' ); insert into logSourceTypes ( name, description ) values ( 'Sniffer Feed', 'Data collected by a web traffic sniffer.' ); -- -- the "not so straightforward" tables -- create table keyValuePairs ( id serial, key varchar( 64 ) not null, value varchar( 128 ) ); create unique index keyValuePairsIndex on keyValuePairs ( key, value ); -- example: for the url http://foo.com/test.html?a=1&b=2 -- a and b are keys and 1 and 2 are values. this info is -- NOT recorded in the paths table. create table requestToKeyValuePairs ( requestId int not null, keyValuePairId int not null ); create unique index requestToKeyValuePairsIndex on requestToKeyValuePairs ( requestId, keyValuePairId ); -- this table ties key value pairs to specific requests create table sessionVariables ( id serial, sessionVariableTypeId int not null, variableData varchar( 32 ) not null ); create unique index sessionVariablesIndex on sessionVariables ( sessionVariableTypeId, variableData ); -- example: a7wjd7392jd72ms -- this bares some explanation. session variables are user defineanble -- things that you want to track from request to request such as an -- object identifier or a cookie. they track items across multiple -- requests that have either differient lifespans ( such as a persistant -- cookie and a session cookie ) or things that aren't directly visable -- in the logs. ( such as a user id or other variables stored in a -- session object ) -- on the face of it, one would think that we should convolve -- sessionVariables and keyValuePairs because session variables are -- realy just key value pairs. the reason we don't is because keys may -- have the same name as session variables and it is slightly more -- handy to break out the name and description of session variables -- should one want to change a session variable name mid-stream. (such -- as when one moves from one servlet runner to another which names -- state differiently) create table sessionVariableTypes ( id serial, type varchar( 32 ) not null, description varchar( 128 ) ); create unique index sessionVariableTypesIndex on sessionVariableTypes ( type ); -- examples: jsessionid, persistantCookie, userId create table requestToSessionVariables ( requestId int not null, sessionVariableId int not null ); create unique index requestToSessionVariablesIndex on requestToSessionVariables ( requestId, sessionVariableId ); -- if there are 2 or more sessionVariables for a request, this table -- will grow that many more times faster than the requests table. -- hence if you track a persistant cookie, a session cookie and a -- user id, this table is going to be seriously BIG! (3 times that of -- the requests table or 3 times the number of hits!) so go easy on -- number of session variables you choose to track because reporting -- can become impractical quickly if the number of requests is -- significant. -- -- support tables -- -- the following tables are fleshed out at some other time than initial -- data import. they are permanent support tables that exist to make -- queries lighter on the system. create table refererToKeyValuePairs ( refererId int not null, keyValuePairId int not null ); create unique index refererToKeyValuePairsIndex on refererToKeyValuePairs ( refererId, keyValuePairId ); -- referer urls somtimes include arguments. (as in the case with a referer -- from a search engine bearing the search terms) -- -- functions -- -- select addRequest( '10.1.1.1', 'anders', now(), 'GET', '/index.html', 'HTTP/1.0', 200, 345, 'http://www.anders.com/', 'Mozilla', 'www.bandwidth.com'); -- -- basic request functions -- create or replace function addRequest( inet, text, datetime, text, text, text, int, int, text, text, text, text, text, text ) returns int as ' declare ipVar alias for $1; usernameVar alias for $2; timestampVar alias for $3; methodVar alias for $4; pathVar alias for $5; versionVar alias for $6; statusVar alias for $7; sizeVar alias for $8; refererVar alias for $9; userAgentVar alias for $10; siteVar alias for $11; serverVar alias for $12; logSourceVar alias for $13; logSourceTypeVar alias for $14; requestIdVar int; ipIdVar int; usernameIdVar int; methodIdVar int; pathIdVar int; versionIdVar int; statusIdVar int; refererIdVar int; userAgentIdVar int; siteIdVar int; serverIdVar int; logSourceIdVar int; begin ipIdVar := getInsertIpId( ipVar ); usernameIdVar := getInsertUsernameId( usernameVar ); methodIdVar := getInsertMethodId( methodVar ); pathIdVar := getInsertPathId( pathVar ); versionIdVar := getInsertVersionId( versionVar ); statusIdVar := getInsertStatusId( statusVar ); refererIdVar := getInsertRefererId( refererVar ); userAgentIdVar := getInsertUserAgentId( userAgentVar ); siteIdVar := getInsertSiteId( siteVar ); serverIdVar := getInsertServerId( serverVar ); logSourceIdVar := getInsertLogSourceId( logSourceVar, logSourceTypeVar ); insert into requests ( ipId, usernameId, timestamp, methodId, pathId, versionId, statusId, size, refererId, userAgentId, siteId, serverId, logSourceId ) values ( ipIdVar, usernameIdVar, timestampVar, methodIdVar, pathIdVar, versionIdVar, statusIdVar, sizeVar, refererIdVar, userAgentIdVar, siteIdVar, serverIdVar, logSourceIdVar ); requestIdVar := currval( ''requests_id_seq'' ); return requestIdVar; end ' language 'plpgsql'; create or replace function getInsertIpId( inet ) returns int as ' declare ipInet alias for $1; ipId int; begin select into ipId id from ips where ip = ipInet; if not found then insert into ips ( ip ) values ( ipInet ); ipId := currval( ''ips_id_seq'' ); end if; return ipId; end ' language 'plpgsql'; create or replace function getInsertUsernameId( text ) returns int as ' declare usernameText alias for $1; usernameId int; begin select into usernameId id from usernames where username = usernameText; if not found then insert into usernames ( username ) values ( usernameText ); usernameId := currval( ''usernames_id_seq'' ); end if; return usernameId; end ' language 'plpgsql'; create or replace function getInsertMethodId( text ) returns int as ' declare methodText alias for $1; methodId int; begin select into methodId id from methods where method = methodText; if not found then insert into methods ( method ) values ( methodText ); methodId := currval( ''methods_id_seq'' ); end if; return methodId; end ' language 'plpgsql'; create or replace function getInsertPathId( text ) returns int as ' declare pathText alias for $1; pathId int; begin select into pathId id from paths where path = pathText; if not found then insert into paths ( path ) values ( pathText ); pathId := currval( ''paths_id_seq'' ); end if; return pathId; end ' language 'plpgsql'; create or replace function getInsertVersionId( text ) returns int as ' declare versionText alias for $1; versionId int; begin select into versionId id from versions where version = versionText; if not found then insert into versions ( version ) values ( versionText ); versionId := currval( ''versions_id_seq'' ); end if; return versionId; end ' language 'plpgsql'; create or replace function getInsertStatusId( int ) returns int as ' declare statusInt alias for $1; statusId int; begin select into statusId id from statuses where status = statusInt; if not found then insert into statuses ( status ) values ( statusInt ); statusId := currval( ''statuses_id_seq'' ); end if; return statusId; end ' language 'plpgsql'; create or replace function getInsertRefererId( text ) returns int as ' declare refererText alias for $1; refererId int; begin select into refererId id from referers where referer = refererText; if not found then insert into referers ( referer ) values ( refererText ); refererId := currval( ''referers_id_seq'' ); end if; return refererId; end ' language 'plpgsql'; create or replace function getInsertUserAgentId( text ) returns int as ' declare userAgentText alias for $1; userAgentId int; begin select into userAgentId id from userAgents where userAgent = userAgentText; if not found then insert into userAgents ( userAgent ) values ( userAgentText ); userAgentId := currval( ''userAgents_id_seq'' ); end if; return userAgentId; end ' language 'plpgsql'; create or replace function getInsertSiteId( text ) returns int as ' declare siteText alias for $1; siteId int; begin select into siteId id from sites where url = siteText; if not found then insert into sites ( url ) values ( siteText ); siteId := currval( ''sites_id_seq'' ); end if; return siteId; end ' language 'plpgsql'; create or replace function getInsertServerId( text ) returns int as ' declare serverText alias for $1; serverId int; begin select into serverId id from servers where name = serverText; if not found then insert into servers ( name ) values ( serverText ); serverId := currval( ''servers_id_seq'' ); end if; return serverId; end ' language 'plpgsql'; create or replace function getInsertLogSourceId( text ) returns int as ' declare logSourceText alias for $1; logSourceTypeText alias for $2; logSourceId int; logSourceTypeId int; begin select into logSourceId id from logSources where name = logSourceText; if not found then select into logSourceTypeId id from logSourceTypes where name = logSourceTypeText; if not found then insert into logSourceTypes ( name ) values ( logSourceTypeText ); logSourceTypeId := curval( ''logSourceTypes_id_seq'' ); end if; insert into logSources ( name, logSourceTypeId, timestamp ) values ( logSourceText, logSourceTypeId, now( ) ); logSourceId := currval( ''logSources_id_seq'' ); end if; return logSourceId; end ' language 'plpgsql'; -- -- key value pair functions -- create or replace function addKeyValue( int, text, text ) returns int as ' declare requestIdVar alias for $1; keyVar alias for $2; valueVar alias for $3; keyValuePairIdVar int; begin keyValuePairIdVar := getInsertKeyValuePairId( keyVar, valueVar ); insert into requestToKeyValuePairs( requestId, keyValuePairId ) values ( requestIdVar, keyValuePairIdVar ); return 1; end ' language 'plpgsql'; create or replace function getInsertKeyValuePairId( text, text ) returns int as ' declare keyVar alias for $1; valueVar alias for $2; keyValuePairIdVar int; begin select into keyValuePairIdVar id from keyValuePairs where key = keyVar and value = valueVar; if not found then insert into keyValuePairs ( key, value ) values ( keyVar, valueVar ); keyValuePairIdVar := currval( ''keyValuePairs_id_seq'' ); end if; return keyValuePairIdVar; end ' language 'plpgsql'; -- -- session variable functions -- create or replace function addSessionVariable( int, text, text ) returns int as ' declare requestIdVar alias for $1; sessionVar alias for $2; variableVar alias for $3; variableTypeIdVar int; variableIdVar int; begin variableTypeIdVar := getInsertSessionVariableTypeId( sessionVar ); variableIdVar := getInsertSessionVariableId( variableTypeIdVar, variableVar ); insert into requestToSessionVariables( requestId, sessionVariableId ) values ( requestIdVar, variableIdVar ); return 1; end ' language 'plpgsql'; create or replace function getInsertSessionVariableTypeId( text ) returns int as ' declare variableTypeVar alias for $1; variableTypeIdVar int; begin select into variableTypeIdVar id from sessionVariableTypes where type = variableTypeVar; if not found then insert into sessionVariableTypes ( type ) values ( variableTypeVar ); variableTypeIdVar := currval( ''sessionVariableTypes_id_seq'' ); end if; return variableTypeIdVar; end ' language 'plpgsql'; create or replace function getInsertSessionVariableId( int, text ) returns int as ' declare variableTypeIdVar alias for $1; variableDataVar alias for $2; variableIdVar int; begin select into variableIdVar id from sessionVariables where sessionVariableTypeId = variableTypeIdVar and variableData = variableDataVar; if not found then insert into sessionVariables ( sessionVariableTypeId, variableData ) values ( variableTypeIdVar, variableDataVar ); variableIdVar := currval( ''sessionVariables_id_seq'' ); end if; return variableIdVar; end ' language 'plpgsql'; -- -- reporting functions --