collector.conf
collector {
interface = "0.0.0.0"
interface = ${?COLLECTOR_INTERFACE}
port = 8181
port = ${?COLLECTOR_PORT}
# optional SSL/TLS configuration
ssl {
enable = false
enable = ${?COLLECTOR_SSL}
# whether to redirect HTTP to HTTPS
redirect = false
redirect = ${?COLLECTOR_SSL_REDIRECT}
port = 9543
port = ${?COLLECTOR_SSL_PORT}
}
paths {
# "/com.acme/track" = "/com.snowplowanalytics.snowplow/tp2"
# "/com.acme/redirect" = "/r/tp2"
# "/com.acme/iglu" = "/com.snowplowanalytics.iglu/v1"
}
# Configure the P3P policy header.
p3p {
policyRef = "/w3c/p3p.xml"
CP = "NOI DSP COR NID PSA OUR IND COM NAV STA"
}
crossDomain {
enabled = false
enabled = ${?COLLECTOR_CROSS_DOMAIN_ENABLED}
# Domains that are granted access, *.acme.com will match http://acme.com and http://sub.acme.com
domains = [ "*" ]
domains = [ ${?COLLECTOR_CROSS_DOMAIN_DOMAIN} ]
# Whether to only grant access to HTTPS or both HTTPS and HTTP sources
secure = true
secure = ${?COLLECTOR_CROSS_DOMAIN_SECURE}
}
cookie {
enabled = true
enabled = ${?COLLECTOR_COOKIE_ENABLED}
expiration = "365 days"
expiration = ${?COLLECTOR_COOKIE_EXPIRATION}
# Network cookie name
name = zanui_collector_cookie
name = ${?COLLECTOR_COOKIE_NAME}
domains = [
"{{cookieDomain1}}" # e.g. "domain.com" -> any origin domain ending with this will be matched and domain.com will be returned
"{{cookieDomain2}}" # e.g. "secure.anotherdomain.com" -> any origin domain ending with this will be matched and secure.anotherdomain.com will be returned
# ... more domains
]
domains += ${?COLLECTOR_COOKIE_DOMAIN_1}
domains += ${?COLLECTOR_COOKIE_DOMAIN_2}
fallbackDomain = ""
fallbackDomain = ${?FALLBACK_DOMAIN}
secure = false
secure = ${?COLLECTOR_COOKIE_SECURE}
httpOnly = false
httpOnly = ${?COLLECTOR_COOKIE_HTTP_ONLY}
sameSite = "{{cookieSameSite}}"
sameSite = ${?COLLECTOR_COOKIE_SAME_SITE}
}
doNotTrackCookie {
enabled = false
enabled = ${?COLLECTOR_DO_NOT_TRACK_COOKIE_ENABLED}
# name = {{doNotTrackCookieName}}
name = zanui-collector-do-not-track-cookie
# value = {{doNotTrackCookieValue}}
value = zanui-collector-do-not-track-cookie-value
}
cookieBounce {
enabled = false
enabled = ${?COLLECTOR_COOKIE_BOUNCE_ENABLED}
name = "n3pc"
name = ${?COLLECTOR_COOKIE_BOUNCE_NAME}
fallbackNetworkUserId = "00000000-0000-4000-A000-000000000000"
fallbackNetworkUserId = ${?COLLECTOR_COOKIE_BOUNCE_FALLBACK_NETWORK_USER_ID}
forwardedProtocolHeader = "X-Forwarded-Proto"
forwardedProtocolHeader = ${?COLLECTOR_COOKIE_BOUNCE_FORWARDED_PROTOCOL_HEADER}
}
enableDefaultRedirect = true
enableDefaultRedirect = ${?COLLECTOR_ALLOW_REDIRECTS}
redirectMacro {
enabled = false
enabled = ${?COLLECTOR_REDIRECT_MACRO_ENABLED}
# Optional custom placeholder token (defaults to the literal `${SP_NUID}`)
placeholder = "[TOKEN]"
placeholder = ${?COLLECTOR_REDIRECT_REDIRECT_MACRO_PLACEHOLDER}
}
rootResponse {
enabled = false
enabled = ${?COLLECTOR_ROOT_RESPONSE_ENABLED}
statusCode = 302
statusCode = ${?COLLECTOR_ROOT_RESPONSE_STATUS_CODE}
# Optional, defaults to empty map
headers = {
Location = "https://127.0.0.1/",
Location = ${?COLLECTOR_ROOT_RESPONSE_HEADERS_LOCATION},
X-Custom = "something"
}
# Optional, defaults to empty string
body = "302, redirecting"
body = ${?COLLECTOR_ROOT_RESPONSE_BODY}
}
cors {
accessControlMaxAge = 5 seconds
accessControlMaxAge = ${?COLLECTOR_CORS_ACCESS_CONTROL_MAX_AGE}
}
# Configuration of prometheus http metrics
prometheusMetrics {
enabled = false
}
streams {
# Events which have successfully been collected will be stored in the good stream/topic
good = snowplow-collected-good-events-stream
good = ${?COLLECTOR_STREAMS_GOOD}
# Events that are too big (w.r.t Kinesis 1MB limit) will be stored in the bad stream/topic
bad = snowplow-collected-bad-events-stream
bad = ${?COLLECTOR_STREAMS_BAD}
useIpAddressAsPartitionKey = false
useIpAddressAsPartitionKey = ${?COLLECTOR_STREAMS_USE_IP_ADDRESS_AS_PARTITION_KEY}
sink {
enabled = kinesis
enabled = ${?COLLECTOR_STREAMS_SINK_ENABLED}
# Region where the streams are located
region = ap-southeast-2
region = ${?COLLECTOR_STREAMS_SINK_REGION}
threadPoolSize = 10
threadPoolSize = ${?COLLECTOR_STREAMS_SINK_THREAD_POOL_SIZE}
aws {
accessKey = env
accessKey = ${?COLLECTOR_STREAMS_SINK_AWS_ACCESS_KEY}
secretKey = env
secretKey = ${?COLLECTOR_STREAMS_SINK_AWS_SECRET_KEY}
}
# Minimum and maximum backoff periods, in milliseconds
backoffPolicy {
#minBackoff = {{minBackoffMillis}}
minBackoff = 10
#maxBackoff = {{maxBackoffMillis}}
maxBackoff = 10
}
}
buffer {
byteLimit = 4500000
byteLimit = ${?COLLECTOR_STREAMS_BUFFER_BYTE_LIMIT}
recordLimit = 500 # Not supported by Kafka; will be ignored
recordLimit = ${?COLLECTOR_STREAMS_BUFFER_RECORD_LIMIT}
timeLimit = 5000
timeLimit = ${?COLLECTOR_STREAMS_BUFFER_TIME_LIMIT}
}
}
}
akka {
loglevel = DEBUG # 'OFF' for no logging, 'DEBUG' for all logging.
loglevel = ${?AKKA_LOGLEVEL}
loggers = ["akka.event.slf4j.Slf4jLogger"]
loggers = [${?AKKA_LOGGERS}]
http.server {
remote-address-header = on
remote-address-header = ${?AKKA_HTTP_SERVER_REMOTE_ADDRESS_HEADER}
raw-request-uri-header = on
raw-request-uri-header = ${?AKKA_HTTP_SERVER_RAW_REQUEST_URI_HEADER}
# Define the maximum request length (the default is 2048)
parsing {
max-uri-length = 32768
max-uri-length = ${?AKKA_HTTP_SERVER_PARSING_MAX_URI_LENGTH}
uri-parsing-mode = relaxed
uri-parsing-mode = ${?AKKA_HTTP_SERVER_PARSING_URI_PARSING_MODE}
}
}
}
Run Command:
java -Dcom.amazonaws.sdk.disableCbor -jar snowplow-stream-collector-kinesis-1.0.0.jar --config collector.conf
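For what it's worth, a quick way to sanity-check that the collector is up, assuming the port 8181 from the config above and that this collector version exposes a /health endpoint (a minimal sketch, not part of the pipeline itself):
# Minimal smoke test for a locally running collector: assumes port 8181 from
# the config above and that this collector version answers GET /health.
import urllib.request

# Health check: expect HTTP 200 when the collector is up.
with urllib.request.urlopen("http://localhost:8181/health") as resp:
    print("health:", resp.status)

# Fire a bare-bones request at the GET pixel endpoint /i; the query string
# below is an illustrative minimal payload, not a full tracker payload.
with urllib.request.urlopen("http://localhost:8181/i?e=pv&p=web") as resp:
    print("track:", resp.status)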
enricher.conf
enrich {
streams {
in {
# Stream/topic where the raw events to be enriched are located
raw = snowplow-collected-good-events-stream
raw = ${?ENRICH_STREAMS_IN_RAW}
}
out {
# Stream/topic where the events that were successfully enriched will end up
enriched = snowplow-collected-good-events-stream
# Stream/topic where the event that failed enrichment will be stored
bad = snowplow-collected-bad-events-stream
bad = ${?ENRICH_STREAMS_OUT_BAD}
# Stream/topic where the pii tranformation events will end up
# pii = {{outPii}}
# pii = ${?ENRICH_STREAMS_OUT_PII}
partitionKey = event_id
partitionKey = ${?ENRICH_STREAMS_OUT_PARTITION_KEY}
}
sourceSink {
enabled = kinesis
enabled = ${?ENRICH_STREAMS_SOURCE_SINK_ENABLED}
region = ap-southeast-2
aws {
accessKey = env
accessKey = ${?ENRICH_STREAMS_SOURCE_SINK_AWS_ACCESS_KEY}
secretKey = env
secretKey = ${?ENRICH_STREAMS_SOURCE_SINK_AWS_SECRET_KEY}
}
maxRecords = 10000
initialPosition = TRIM_HORIZON
initialTimestamp = "2020-09-10T10:00:00Z"
backoffPolicy {
minBackoff = 1000
minBackoff = ${?ENRICH_STREAMS_SOURCE_SINK_BACKOFF_POLICY_MIN_BACKOFF}
maxBackoff = 5000
maxBackoff = ${?ENRICH_STREAMS_SOURCE_SINK_BACKOFF_POLICY_MAX_BACKOFF}
}
}
buffer {
byteLimit = 1000000000
byteLimit = ${?ENRICH_STREAMS_BUFFER_BYTE_LIMIT}
recordLimit = 10 # Not supported by Kafka; will be ignored
recordLimit = ${?ENRICH_STREAMS_BUFFER_RECORD_LIMIT}
timeLimit = 5000
timeLimit = ${?ENRICH_STREAMS_BUFFER_TIME_LIMIT}
}
appName = "zanui-enricher-app"
appName = ${?ENRICH_STREAMS_APP_NAME}
}
}
Run Command:
java -jar snowplow-stream-enrich-kinesis-1.0.0.jar --config enricher.conf --resolver file:resolver.json
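To verify that Stream Enrich is actually writing events, you can read a few records straight off the output Kinesis stream. A rough sketch with boto3, assuming the stream name and region from the configs above and AWS credentials available in the environment:
# Pull a handful of records from the output stream to confirm enrichment is
# producing data (stream name and region taken from the configs above).
import boto3

kinesis = boto3.client("kinesis", region_name="ap-southeast-2")
stream = "snowplow-collected-good-events-stream"

shard_id = kinesis.list_shards(StreamName=stream)["Shards"][0]["ShardId"]
iterator = kinesis.get_shard_iterator(
    StreamName=stream,
    ShardId=shard_id,
    ShardIteratorType="TRIM_HORIZON",
)["ShardIterator"]

for record in kinesis.get_records(ShardIterator=iterator, Limit=10)["Records"]:
    # Enriched events are tab-separated text; raw collector payloads are Thrift bytes.
    print(record["Data"][:200])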
S3 Loader Config
source = "kinesis"
sink = "kinesis"
aws {
accessKey = "env"
secretKey = "env"
}
# Config for NSQ
nsq {
channelName = "nsqSourceChannelName"
# Host name for NSQ tools
host = "127.0.0.1"
# HTTP port for nsqd
port = 4150
# HTTP port for nsqlookupd
lookupPort = 4161
}
kinesis {
initialPosition = "TRIM_HORIZON"
initialTimestamp = "2017-05-17T10:00:00Z"
maxRecords = 10000
region = "ap-southeast-2"
appName = "zanui-enricher-app"
}
streams {
inStreamName = "snowplow-collected-good-events-stream"
outStreamName = "snowplow-collected-bad-events-stream"
buffer {
byteLimit = 1000000000 # Not supported by NSQ; will be ignored
recordLimit = 10
timeLimit = 5000 # Not supported by NSQ; will be ignored
}
}
s3 {
region = "ap-southeast-2"
bucket = "snowplow-enriched-good-events"
partitionedBucket = "snowplow-enriched-good-events/partitioned"
dateFormat = "{YYYY}/{MM}/{dd}/{HH}"
outputDirectory = "zanui-enriched/good"
filenamePrefix = "zanui-output"
format = "gzip"
# Maximum Timeout that the application is allowed to fail for (in milliseconds)
maxTimeout = 300000 # 5 minutes
}
Run Command:
java -jar snowplow-s3-loader-0.6.0.jar --config my.conf
But this Snowplow S3 Loader was not doing anything, so I used Kinesis Data Firehose to transfer the stream to an S3 bucket.
When I try to use an AWS Lambda transformation in Data Firehose, it gives this error:
{"attemptsMade":4,"arrivalTimestamp":1600154159619,"errorCode":"Lambda.FunctionError","errorMessage":"The Lambda function was successfully invoked but it returned an error result.","attemptEndingTimestamp":1600154235846,"rawData":"****","lambdaArn":"arn:aws:lambda:ap-southeast-2:573188294151:function:snowplow-json-transformer-lambda:$LATEST"}
{"attemptsMade":4,"arrivalTimestamp":1600154161523,"errorCode":"Lambda.FunctionError","errorMessage":"The Lambda function was successfully invoked but it returned an error result.","attemptEndingTimestamp":1600154235846,"rawData":"*****=","lambdaArn":"arn:aws:lambda:ap-southeast-2:573188294151:function:snowplow-json-transformer-lambda:$LATEST"}
If I don't use the Lambda, a log is created in the S3 good enriched bucket for the page view event, but at the same time a log is created in the S3 bad enriched bucket for the same page view event saying:
{"schema":"iglu:com.snowplowanalytics.snowplow.badrows/collector_payload_format_violation/jsonschema/1-0-0","data":{"processor":{"artifact":"snowplow-stream-enrich","version":"1.0.0"},"failure":{"timestamp":"2020-09-15T07:16:02.488Z","loader":"thrift","message":{"error":"error deserializing raw event: Cannot read. Remote side has closed. Tried to read 2 bytes, but only got 1 bytes. (This is often indicative of an internal error on the server side. Please check your server logs.)"}},"payload":"****="}}
I have followed the documentation repeatedly, but I am confused about the setup for Stream Enrich. What I did not understand is: do we need to set up a database for Stream Enrich if we are not using a custom schema? Since I am only testing with a page view event from the JavaScript Tracker, I have not set up any database, but I have granted the IAM role permission to create and edit DynamoDB tables.
Please help me set up Snowplow if anyone has done this before. Please :(
I have written a blog post on how to set up Snowplow Analytics in AWS.
Here is the link; hope this helps you.