Log File Source Gallery

Input format sample

Device,Date,SrcIP,dstIP,Alert,Country
SCANNER_1,2012-01-01T13:43:00,10.0.0.1,66.66.66.66,DUMMY_ALERT_TYPE_1,United States
SCANNER_2,2012-02-01T14:21:00,10.0.0.2,66.66.66.66,DUMMY_ALERT_TYPE_2,United Kingdom
SCANNER_3,2012-03-01T15:17:00,10.0.0.1,99.66.99.66,DUMMY_ALERT_TYPE_3,Netherlands

Source #1a - fileshare, manual parsing

{
    "description": "For cyber demo",
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "processingPipeline": [
        {
            "file": {
                "XmlRootLevelValues": [],
                "domain": "DOMAIN",
                "password": "PASSWORD",
                "type": "csv",
                "username": "USER",
                "url": "smb://FILESHARE:139/cyber_logs/"
            }
        },
        {
            "globals": {
                "scripts": [
                    "function decode(x)\n{\n    var info = {};   \n    var rec = x.split(',');   \n    info.device = rec[0];\n    info.date = rec[1];\n    info.srcIP = rec[2];\n    info.dstIP = rec[3];\n    info.alert = rec[4];\n    info.country = rec[5];\n    return info;\n}"
                ]
            }
        },
        {
            "harvest": {
                "searchCycle_secs": 3600
            }
        },
        {
            "docMetadata": {
                "title": "$metadata.info.alert @ $metadata.info.date [$metadata.info.device]: $metadata.info.dstIP -> $metadata.info.srcIP",
                "publishedDate": "$SCRIPT( return _doc.metadata.info[0].date; )"
            }
        },
        {
            "contentMetadata": [
                {
                    "fieldName": "info",
                    "script": "var info = decode(text); info;",
                    "scriptlang": "javascript"
                }
            ]
        },
        {
            "text": [
                {
                    "fieldName": "fullText",
                    "script": ",",
                    "scriptlang": "regex",
                    "flags": "md",
                    "replacement": " , "
                },
                {
                    "fieldName": "description",
                    "script": ",",
                    "scriptlang": "regex",
                    "flags": "md",
                    "replacement": " , "
                }
            ]
        },
        {
            "entities": [
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.srcIP",
                    "type": "PrivateIP"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.dstIP",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "PublicIP"
                },
                {
                    "actual_name": "$metadata.info.country",
                    "dimension": "Where",
                    "disambiguated_name": "$SCRIPT( return _doc.metadata.info[0].country; )",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "Country"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.device",
                    "type": "Sensor"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.alert",
                    "type": "AlertType"
                }
            ]
        },
        {
            "associations": [
                {
                    "entity1": "$metadata.info.dstIP",
                    "entity2": "$metadata.info.srcIP",
                    "geo_index": "$SCRIPT( return _doc.metadata.info[0].country + '/country'; )",
                    "time_start": "$SCRIPT( return _doc.metadata.info[0].date; )",
                    "verb": "$SCRIPT( return _doc.metadata.info[0].alert; )",
                    "verb_category": "$SCRIPT( return _doc.metadata.info[0].alert; )"
                }
            ]
        },
        {
            "searchIndex": {
                "metadataFieldList": ""
            }
        }
    ]
}

Source #1b - fileshare, automated parsing - headers manually specified

{
    "description": "For cyber demo",
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "processingPipeline": [
        {
            "file": {
                "XmlRootLevelValues": [
                    "device",
                    "date",
                    "srcIP",
                    "dstIP",
                    "alert",
                    "country"
                ],
                "XmlIgnoreValues": [
                    "device,date,srcIP"
                ],
                "domain": "DOMAIN",
                "password": "PASSWORD",
                "type": "csv",
                "username": "USER",
                "url": "smb://FILESHARE:139/cyber_logs/"
            }
        },
        {
            "harvest": {
                "searchCycle_secs": 3600
            }
        },
        {
            "docMetadata": {
                "title": "$metadata.csv.alert @ $metadata.csv.date [$metadata.csv.device]: $metadata.csv.dstIP -> $metadata.csv.srcIP",
                "publishedDate": "$SCRIPT( return _doc.metadata.csv[0].date; )"
            }
        },
        {
            "entities": [
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.srcIP",
                    "type": "PrivateIP"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.dstIP",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.csv[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "PublicIP"
                },
                {
                    "actual_name": "$metadata.csv.country",
                    "dimension": "Where",
                    "disambiguated_name": "$SCRIPT( return _doc.metadata.csv[0].country; )",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.csv[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "Country"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.device",
                    "type": "Sensor"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.alert",
                    "type": "AlertType"
                }
            ]
        },
        {
            "associations": [
                {
                    "entity1": "$metadata.csv.dstIP",
                    "entity2": "$metadata.csv.srcIP",
                    "geo_index": "$SCRIPT( return _doc.metadata.csv[0].country + '/country'; )",
                    "time_start": "$SCRIPT( return _doc.metadata.csv[0].date; )",
                    "verb": "$SCRIPT( return _doc.metadata.csv[0].alert; )",
                    "verb_category": "$SCRIPT( return _doc.metadata.csv[0].alert; )"
                }
            ]
        },
        {
            "searchIndex": {
                "metadataFieldList": ""
            }
        }
    ]
}

Source #1c - fileshare, automated parsing - headers automatically specified

For the purpose of this example, imagine that the first line starts with a "#", eg:

#Device,Date,SrcIP,dstIP,Alert,Country
SCANNER_1,2012-01-01T13:43:00,10.0.0.1,66.66.66.66,DUMMY_ALERT_TYPE_1,United States

...

Then:

{
    "description": "For cyber demo",
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "processingPipeline": [
        {
            "file": {
                "XmlIgnoreValues": [
                    "#"
                ],
                "domain": "DOMAIN",
                "password": "PASSWORD",
                "type": "csv",
                "username": "USER",
                "url": "smb://FILESHARE:139/cyber_logs/"
            }
        },
        {
            "harvest": {
                "searchCycle_secs": 3600
            }
        },
        {
            "docMetadata": {
                "title": "$metadata.csv.alert @ $metadata.csv.date [$metadata.csv.device]: $metadata.csv.dstIP -> $metadata.csv.srcIP",
                "publishedDate": "$SCRIPT( return _doc.metadata.csv[0].date; )"
            }
        },
        {
            "entities": [
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.srcIP",
                    "type": "PrivateIP"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.dstIP",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.csv[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "PublicIP"
                },
                {
                    "actual_name": "$metadata.csv.country",
                    "dimension": "Where",
                    "disambiguated_name": "$SCRIPT( return _doc.metadata.csv[0].country; )",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.csv[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "Country"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.device",
                    "type": "Sensor"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.csv.alert",
                    "type": "AlertType"
                }
            ]
        },
        {
            "associations": [
                {
                    "entity1": "$metadata.csv.dstIP",
                    "entity2": "$metadata.csv.srcIP",
                    "geo_index": "$SCRIPT( return _doc.metadata.csv[0].country + '/country'; )",
                    "time_start": "$SCRIPT( return _doc.metadata.csv[0].date; )",
                    "verb": "$SCRIPT( return _doc.metadata.csv[0].alert; )",
                    "verb_category": "$SCRIPT( return _doc.metadata.csv[0].alert; )"
                }
            ]
        },
        {
            "searchIndex": {
                "metadataFieldList": ""
            }
        }
    ]
}

Note that if the header line had no leading "#" (e.g. first line "Device,Date,SrcIP,dstIP,Alert,Country"), then the "XmlIgnoreValues" field would have needed to be "\"Device\"" to correctly parse the headers (assuming the quote character was ").

Source #2a - web (including uploaded fileshares), manual parsing

It is slightly more complicated to parse CSV files over the Web, but still quite possible, using the "links" pipeline element (the equivalent of the old-format "searchConfig" capability). Note that one neat trick is to upload a share to Infinit.e, and then use an API key to access the REST interface. Users can allocate themselves an API key from the People Manager.

Note that when accessing Web documents you must use "feed.extraUrls" ("rss.extraUrls" in the old format) and specify minimally "url" and "title" fields, and not the top-level "url" (otherwise the URL is treated as an RSS feed rather than a standalone web page).

{
    "description": "For cyber demo",
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "processingPipeline": [
        {
            "feed": {
                "extraUrls": [
                    {
                        "url": "http://INFINITE_ENDPOINT/api/share/get/51ad28a440b4a4f0f757824c?infinite_api_key=API_KEY"
                    }
                ]
            }
        },
        {
            "globals": {
                "scripts": [
                    "function decode(x)\n{\n    var info = {};   \n    var rec = x.split(',');   \n    info.device = rec[0];\n    info.date = rec[1];\n    info.srcIP = rec[2];\n    info.dstIP = rec[3];\n    info.alert = rec[4];\n    info.country = rec[5];\n    return info;\n}"
                ]
            }
        },
        {
            "harvest": {
                "searchCycle_secs": 3600
            }
        },
        {
            "links": {
                "script": "var retVals = [];\nvar n = -1;\nvar url = _doc.url.replace(/[?].*/,\"\");\nvar start = 0;\nwhile (start < text.length) {\n    var end = text.indexOf('\\n', start);\n    if (end == -1) end = text.length;\n    // substring() takes an end index; substr() takes a length and would over-read every line after the first\n    var line = text.substring(start, end).replace(/\\r$/, '');\n    start = end + 1;\n\n    n++;\n    if (0 == n) continue; // skip the first line (the CSV header in the sample input)\n\n    var title = 'line #' + n.toString();\n    var url2 = url + '#' + n.toString();\n    var retVal = { 'title':title, 'url':url2, 'fullText':line };\n    retVals.push(retVal);\n}\nretVals;\n"
            }
        },
        {
            "docMetadata": {
                "title": "$metadata.info.alert @ $metadata.info.date [$metadata.info.device]: $metadata.info.dstIP -> $metadata.info.srcIP",
                "publishedDate": "$SCRIPT( return _doc.metadata.info[0].date; )"
            }
        },
        {
            "contentMetadata": [
                {
                    "fieldName": "info",
                    "script": "var info = decode(text); info;",
                    "scriptlang": "javascript"
                }
            ]
        },
        {
            "text": [
                {
                    "fieldName": "fullText",
                    "script": ",",
                    "scriptlang": "regex",
                    "flags": "md",
                    "replacement": " , "
                },
                {
                    "fieldName": "description",
                    "script": ",",
                    "scriptlang": "regex",
                    "flags": "md",
                    "replacement": " , "
                }
            ]
        },
        {
            "entities": [
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.srcIP",
                    "type": "PrivateIP"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.dstIP",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "PublicIP"
                },
                {
                    "actual_name": "$metadata.info.country",
                    "dimension": "Where",
                    "disambiguated_name": "$SCRIPT( return _doc.metadata.info[0].country; )",
                    "geotag": {
                        "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                    },
                    "ontology_type": "country",
                    "type": "Country"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.device",
                    "type": "Sensor"
                },
                {
                    "dimension": "What",
                    "disambiguated_name": "$metadata.info.alert",
                    "type": "AlertType"
                }
            ]
        },
        {
            "associations": [
                {
                    "entity1": "$metadata.info.dstIP",
                    "entity2": "$metadata.info.srcIP",
                    "geo_index": "$SCRIPT( return _doc.metadata.info[0].country + '/country'; )",
                    "time_start": "$SCRIPT( return _doc.metadata.info[0].date; )",
                    "verb": "$SCRIPT( return _doc.metadata.info[0].alert; )",
                    "verb_category": "$SCRIPT( return _doc.metadata.info[0].alert; )"
                }
            ]
        },
        {
            "searchIndex": {
                "metadataFieldList": ""
            }
        }
    ]
}

Note that the API key is not visible in any of the extracted documents (it is removed in the "links.script" code, which strips the query string from the URL), and is also not visible in the source to anyone but the source owner and administrator (due to the "isPublic:false" field). As an alternative (from June 2013), a cookie can be used: (eg) "rss.httpFields": { "Cookie": "infinitecookie=api:API_KEY;" }.

Source #2b - web (including uploaded fileshares), manual parsing using Java

As above, except the "globals.scripts" entry will look like:

// Shared opencsv parser, created once through Rhino's Java bridge.
var parser = new Packages.au.com.bytecode.opencsv.CSVParser();

// Parses one CSV line and returns an object keyed by column name.
// Columns are taken positionally: device, date, srcIP, dstIP, alert, country.
function decode(x)
{
   var fieldNames = ['device', 'date', 'srcIP', 'dstIP', 'alert', 'country'];
   var rec = parser.parseLine(x.toString());
   var info = {};
   for (var i = 0; i < fieldNames.length; i++) {
      // '' + value converts the Java string into a javascript string
      info[fieldNames[i]] = '' + rec[i];
   }
   return info;
}

(Note that the "'' +  <string-variable>" construct is necessary to convert from Java strings to javascript strings)

See the opencsv documentation for more details.

Output sample

(For sources 1b and 1c, metadata.info is called metadata.csv)
{
    "associations": [{
        "assoc_type": "Event",
        "entity1": "66.66.66.66",
        "entity1_index": "66.66.66.66/publicip",
        "entity2": "10.0.0.1",
        "entity2_index": "10.0.0.1/privateip",
        "geo_index": "united states/country",
        "time_start": "2012-01-01T13:43:00",
        "verb": "DUMMY_ALERT_TYPE_1",
        "verb_category": "DUMMY_ALERT_TYPE_1"
    }],
    "communityId": ["506dc16dfbf042893dd6b8f2"],
    "created": "Jun 4, 2013 12:54:34 AM UTC",
    "entities": [
        {
            "actual_name": "10.0.0.1",
            "dimension": "What",
            "disambiguated_name": "10.0.0.1",
            "doccount": 0,
            "frequency": 1,
            "index": "10.0.0.1/privateip",
            "relevance": 0,
            "totalfrequency": -1,
            "type": "PrivateIP"
        },
        {
            "actual_name": "66.66.66.66",
            "dimension": "What",
            "disambiguated_name": "66.66.66.66",
            "doccount": 0,
            "frequency": 1,
            "index": "66.66.66.66/publicip",
            "relevance": 0,
            "totalfrequency": -1,
            "type": "PublicIP"
        },
        {
            "actual_name": "United States",
            "dimension": "Where",
            "disambiguated_name": "United States",
            "doccount": 0,
            "frequency": 1,
            "index": "united states/country",
            "ontology_type": "country",
            "relevance": 0,
            "totalfrequency": -1,
            "type": "Country"
        },
        {
            "actual_name": "SCANNER_1",
            "dimension": "What",
            "disambiguated_name": "SCANNER_1",
            "doccount": 0,
            "frequency": 1,
            "index": "scanner_1/sensor",
            "relevance": 0,
            "totalfrequency": -1,
            "type": "Sensor"
        },
        {
            "actual_name": "DUMMY_ALERT_TYPE_1",
            "dimension": "What",
            "disambiguated_name": "DUMMY_ALERT_TYPE_1",
            "doccount": 0,
            "frequency": 1,
            "index": "dummy_alert_type_1/alerttype",
            "relevance": 0,
            "totalfrequency": -1,
            "type": "AlertType"
        }
    ],
    "fullText": "SCANNER_1 , 2012-01-01T13:43:00 , 10.0.0.1 , 66.66.66.66 , DUMMY_ALERT_TYPE_1 , United States",
    "mediaType": ["Log"],
    "metadata": {"info": [{
        "alert": "DUMMY_ALERT_TYPE_1 ",
        "country": "United States",
        "date": "2012-01-01T13:43:00",
        "device": "SCANNER_1 ",
        "dstIP": "66.66.66.66",
        "srcIP": " 10.0.0.1"
    }]},
    "modified": "Jun 4, 2013 12:54:34 AM UTC",
    "publishedDate": "January 1, 2012 13:43:00 PM UTC",
    "source": ["Cyber Logs Test"],
    "sourceKey": ["INFINITE_ENDPOINT.api.share.get.51ad28a440b4a4f0f757824c.25.26"],
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "DUMMY_ALERT_TYPE_1  @ 2012-01-01T13:43:00 [SCANNER_1 ]: 66.66.66.66 -> 10.0.0.1",
    "url": "http://INFINITE_ENDPOINT/api/share/get/51ad28a440b4a4f0f757824c#1"
}

Sources - old format

Old Format Source #1a - fileshare, manual parsing

{
    "description": "For cyber demo",
    "extractType": "File",
    "file": {
        "XmlRootLevelValues": [],
        "domain": "DOMAIN",
        "password": "PASSWORD",
        "type": "csv",
        "username": "USER"
    },
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "searchIndexFilter": {
        "metadataFieldList": ""
    },
    "structuredAnalysis": {
        "associations": [
            {
                "entity1": "$metadata.info.dstIP",
                "entity2": "$metadata.info.srcIP",
                "geo_index": "$SCRIPT( return _doc.metadata.info[0].country + '/country'; )",
                "time_start": "$SCRIPT( return _doc.metadata.info[0].date; )",
                "verb": "$SCRIPT( return _doc.metadata.info[0].alert; )",
                "verb_category": "$SCRIPT( return _doc.metadata.info[0].alert; )"
            }
        ],
        "entities": [
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.srcIP",
                "type": "PrivateIP"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.dstIP",
                "geotag": {
                    "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                },
                "ontology_type": "country",
                "type": "PublicIP"
            },
            {
                "actual_name": "$metadata.info.country",
                "dimension": "Where",
                "disambiguated_name": "$SCRIPT( return _doc.metadata.info[0].country; )",
                "geotag": {
                    "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                },
                "ontology_type": "country",
                "type": "Country"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.device",
                "type": "Sensor"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.alert",
                "type": "AlertType"
            }
        ],
        "publishedDate": "$SCRIPT( return _doc.metadata.info[0].date; )",
        "script": "",
        "scriptEngine": "javascript",
        "title": "$metadata.info.alert @ $metadata.info.date [$metadata.info.device]: $metadata.info.dstIP -> $metadata.info.srcIP"
    },
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "unstructuredAnalysis": {
        "meta": [
            {
                "context": "First",
                "fieldName": "info",
                "script": "var info = decode(text); info;",
                "scriptlang": "javascript"
            }
        ],
        "script": "function decode(x)\n{\n    var info = {};   \n    var rec = x.split(',');   \n    info.device = rec[0];\n    info.date = rec[1];\n    info.srcIP = rec[2];\n    info.dstIP = rec[3];\n    info.alert = rec[4];\n    info.country = rec[5];\n    return info;\n}",
        "simpleTextCleanser": [
            {
                "field": "fullText",
                "flags": "md",
                "replacement": " , ",
                "script": ",",
                "scriptlang": "regex"
            },
            {
                "field": "description",
                "flags": "md",
                "replacement": " , ",
                "script": ",",
                "scriptlang": "regex"
            }
        ]
    },
    "useExtractor":"none",
    "useTextExtractor":"none",
    "url": "smb://FILESHARE:139/cyber_logs/"
}

Old Format Source #1b - fileshare, automated parsing

{
    "description": "For cyber demo",
    "extractType": "File",
    "file": {
        "XmlRootLevelValues": [ "device", "date", "srcIP", "dstIP", "alert", "country" ],
		"XmlIgnoreValues": [ "device,date,srcIP" ],
        "domain": "DOMAIN",
        "password": "PASSWORD",
        "type": "csv",
        "username": "USER"
    },
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "searchIndexFilter": {
        "metadataFieldList": ""
    },
    "structuredAnalysis": {
        "associations": [
            {
                "entity1": "$metadata.csv.dstIP",
                "entity2": "$metadata.csv.srcIP",
                "geo_index": "$SCRIPT( return _doc.metadata.csv[0].country + '/country'; )",
                "time_start": "$SCRIPT( return _doc.metadata.csv[0].date; )",
                "verb": "$SCRIPT( return _doc.metadata.csv[0].alert; )",
                "verb_category": "$SCRIPT( return _doc.metadata.csv[0].alert; )"
            }
        ],
        "entities": [
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.csv.srcIP",
                "type": "PrivateIP"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.csv.dstIP",
                "geotag": {
                    "country": "$SCRIPT( return _doc.metadata.csv[0].country; )"
                },
                "ontology_type": "country",
                "type": "PublicIP"
            },
            {
                "actual_name": "$metadata.csv.country",
                "dimension": "Where",
                "disambiguated_name": "$SCRIPT( return _doc.metadata.csv[0].country; )",
                "geotag": {
                    "country": "$SCRIPT( return _doc.metadata.csv[0].country; )"
                },
                "ontology_type": "country",
                "type": "Country"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.csv.device",
                "type": "Sensor"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.csv.alert",
                "type": "AlertType"
            }
        ],
        "publishedDate": "$SCRIPT( return _doc.metadata.csv[0].date; )",
        "script": "",
        "scriptEngine": "javascript",
        "title": "$metadata.csv.alert @ $metadata.csv.date [$metadata.csv.device]: $metadata.csv.dstIP -> $metadata.csv.srcIP"
    },
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "useExtractor":"none",
    "useTextExtractor":"none",
    "url": "smb://FILESHARE:139/cyber_logs/"
}

Old Format Source #2a - web (including uploaded fileshares), manual parsing

It is slightly more complicated to parse CSV files over the Web, but still quite possible, using the searchConfig capability. Note that one neat trick is to upload a share to Infinit.e, and then use an API key to access the REST interface. Users can allocate themselves an API key from the People Manager.

Note that when accessing Web documents you must use "rss.extraUrls" and specify minimally "url" and "title" fields, and not the top-level "url" (otherwise the URL is treated as an RSS feed rather than a standalone web page)

{
    "description": "For cyber demo",
    "extractType": "Feed",
    "isPublic": false,
    "mediaType": "Log",
    "searchCycle_secs": 3600,
    "rss": {
        "extraUrls": [
            { "url": "http://INFINITE_ENDPOINT/api/share/get/51ad28a440b4a4f0f757824c?infinite_api_key=API_KEY" }
        ],
        "searchConfig": {"script": "var retVals = [];\nvar n = -1;\nvar url = _doc.url.replace(/[?].*/,\"\");\nvar start = 0;\nwhile (start < text.length) {\n    var end = text.indexOf('\\n', start);\n    if (end == -1) end = text.length;\n    // substring() takes an end index; substr() takes a length and would over-read every line after the first\n    var line = text.substring(start, end).replace(/\\r$/, '');\n    start = end + 1;\n\n    n++;\n    if (0 == n) continue; // skip the first line (the CSV header in the sample input)\n\n    var title = 'line #' + n.toString();\n    var url2 = url + '#' + n.toString();\n    var retVal = { 'title':title, 'url':url2, 'fullText':line };\n    retVals.push(retVal);\n}\nretVals;\n"}
    },
    "searchIndexFilter": {
        "metadataFieldList": ""
    },
    "structuredAnalysis": {
        "associations": [
            {
                "entity1": "$metadata.info.dstIP",
                "entity2": "$metadata.info.srcIP",
                "geo_index": "$SCRIPT( return _doc.metadata.info[0].country + '/country'; )",
                "time_start": "$SCRIPT( return _doc.metadata.info[0].date; )",
                "verb": "$SCRIPT( return _doc.metadata.info[0].alert; )",
                "verb_category": "$SCRIPT( return _doc.metadata.info[0].alert; )"
            }
        ],
        "entities": [
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.srcIP",
                "type": "PrivateIP"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.dstIP",
                "geotag": {
                    "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                },
                "ontology_type": "country",
                "type": "PublicIP"
            },
            {
                "actual_name": "$metadata.info.country",
                "dimension": "Where",
                "disambiguated_name": "$SCRIPT( return _doc.metadata.info[0].country; )",
                "geotag": {
                    "country": "$SCRIPT( return _doc.metadata.info[0].country; )"
                },
                "ontology_type": "country",
                "type": "Country"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.device",
                "type": "Sensor"
            },
            {
                "dimension": "What",
                "disambiguated_name": "$metadata.info.alert",
                "type": "AlertType"
            }
        ],
        "publishedDate": "$SCRIPT( return _doc.metadata.info[0].date; )",
        "script": "",
        "scriptEngine": "javascript",
        "title": "$metadata.info.alert @ $metadata.info.date [$metadata.info.device]: $metadata.info.dstIP -> $metadata.info.srcIP"
    },
    "tags": [
        "cyber",
        "structured"
    ],
    "title": "Cyber Logs Test",
    "unstructuredAnalysis": {
        "meta": [
            {
                "context": "First",
                "fieldName": "info",
                "script": "var info = decode(text); info;",
                "scriptlang": "javascript"
            }
        ],
        "script": "function decode(x)\n{\n    var info = {};   \n    var rec = x.split(',');   \n    info.device = rec[0];\n    info.date = rec[1];\n    info.srcIP = rec[2];\n    info.dstIP = rec[3];\n    info.alert = rec[4];\n    info.country = rec[5];\n    return info;\n}",
        "simpleTextCleanser": [
            {
                "field": "fullText",
                "flags": "md",
                "replacement": " , ",
                "script": ",",
                "scriptlang": "regex"
            },
            {
                "field": "description",
                "flags": "md",
                "replacement": " , ",
                "script": ",",
                "scriptlang": "regex"
            }
        ]
    },
    "useExtractor":"none",
    "useTextExtractor":"none"	
}

Note that the API key is not visible in any of the extracted documents (it is removed in the "searchConfig.script" code), and is also not visible in the source to anyone but the source owner and administrator (due to the "isPublic:false" field). As an alternative (from June 2013), a cookie can be used: (eg) "rss.httpFields": { "Cookie": "infinitecookie=api:API_KEY;" }.

Old format Source #2b - web (including uploaded fileshares), manual parsing using Java

As above, except "unstructuredAnalysis.script" will look like:

// Shared opencsv parser, created once through Rhino's Java bridge.
var parser = new Packages.au.com.bytecode.opencsv.CSVParser();

// Parses one CSV line and returns an object keyed by column name.
// Columns are taken positionally: device, date, srcIP, dstIP, alert, country.
function decode(x)
{
   var fieldNames = ['device', 'date', 'srcIP', 'dstIP', 'alert', 'country'];
   var rec = parser.parseLine(x.toString());
   var info = {};
   for (var i = 0; i < fieldNames.length; i++) {
      // '' + value converts the Java string into a javascript string
      info[fieldNames[i]] = '' + rec[i];
   }
   return info;
}

(Note that the "'' +  <string-variable>" construct is necessary to convert from Java strings to javascript strings)

See the opencsv documentation for more details.