Sample document
TODO
Source
...
Code Block | ||||
---|---|---|---|---|
| ||||
{
"id": "tag:search.twitter.com,2005:266601489475186688",
"objectType": "activity",
"actor": {
"objectType": "person",
"id": "id:twitter.com:835627776",
"link": "http://www.twitter.com/FocalCRM",
"displayName": "CRM Buddy",
"postedTime": "2012-09-20T13:59:56.000Z",
"image": "http://a0.twimg.com/profile_images/2630355549/8cad59efaddd57283dbb159332336744_normal.jpeg",
"summary": "",
"links": [
{
"href": null,
"rel": "me"
}
],
"friendsCount": 0,
"followersCount": 245,
"listedCount": 6,
"statusesCount": 3688,
"twitterTimeZone": null,
"verified": false,
"utcOffset": null,
"preferredUsername": "FocalCRM",
"languages": [
"en"
]
},
"verb": "post",
"postedTime": "2012-11-08T18:02:02.000Z",
"generator": {
"displayName": "dlvr.it",
"link": "http://dlvr.it"
},
"provider": {
"objectType": "service",
"displayName": "Twitter",
"link": "http://www.twitter.com"
},
"link": "http://twitter.com/FocalCRM/statuses/266601489475186688",
"body": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm",
"object": {
"objectType": "note",
"id": "object:search.twitter.com,2005:266601489475186688",
"summary": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm",
"link": "http://twitter.com/FocalCRM/statuses/266601489475186688",
"postedTime": "2012-11-08T18:02:02.000Z"
},
"twitter_entities": {
"urls": [
{
"display_url": "dlvr.it/2S6sjV",
"indices": [
50,
70
],
"expanded_url": "http://dlvr.it/2S6sjV",
"url": "http://t.co/IvwmjJyV"
}
],
"user_mentions": [],
"hashtags": [
{
"text": "crm",
"indices": [
71,
75
]
}
]
},
"retweetCount": 0,
"gnip": {
"language": {
"value": "en"
},
"matching_rules": [
{
"value": "halo 4",
"tag": null
}
],
"klout_score": 48,
"urls": [
{
"url": "http://t.co/IvwmjJyV",
"expanded_url": "http://www.crmbuyer.com/rsstory/76578.html"
}
]
}
} |
Source
Code Block |
---|
//TODO (INF-1865): need to distinguish between "tweets_to" and "retweets"...
//TODO (INF-1865): looks like body is HTML encoded
//TODO (INF-1865): aggregate sentiment vs user?
//TODO (INF-1865): distinguish between tweets and mentions
{
"description": "A large set of tweets related to Super Storm Sandy",
"isApproved": true,
"isPublic": false,
"mediaType": "Social",
"tags": [
"twitter",
"gnip"
],
"title": "Super Storm Sandy - Twitter: SANDY_SUBSTRING",
"processingPipeline": [
{
"file": {
"XmlPrimaryKey": "link",
"XmlSourceName": "",
"XmlRootLevelValues": [],
"domain": "XXX",
"password": "XXX",
"username": "XXX",
"url": "smb://HOST:139/SHARE/PATH/TO/"
}
},
{
"globals": {
"scripts": [
"function getAddressVal( addressStr, number) { try { var addressArray = addressStr.split(/ *, */); if (addressArray != null && addressArray.length > 0) { if (addressArray[number].toLowerCase()=='ny') { return 'new york'; } else if (addressArray[number].toLowerCase()=='long island' || addressArray[number].toLowerCase()=='li') { return 'medford'; } else { return addressArray[number]; } } else { return ''; } } catch (err) { return ''; } } function getRegion( code ) { if (code.toLowerCase()=='ny') {return 'New York';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else if (code.toLowerCase()=='ct') {return 'Connecticut';} else if (code.toLowerCase()=='md') {return 'Maryland';} else if (code.toLowerCase()=='va') {return 'Virginia';} else if (code.toLowerCase()=='pa') {return 'Pennsylvania';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else {return 'New York';} }"
]
}
},
{
"docMetadata": {
"title": "$metadata.json.body",
"description": "$metadata.json.body",
"fullText": "$metadata.json.body",
"publishedDate": "$SCRIPT(return _doc.metadata.json[0].postedTime.replace(/.[0-9]{3}Z/,'Z');)",
"geotag": {
"lat": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[0];} catch (err) {return '';})",
"lon": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[1];} catch (err) {return '';})"
}
}
},
{
"featureEngine": {
"engineName": "AlchemyAPI-metadata",
"engineConfig": {
"app.alchemyapi-metadata.batchSize": 100,
"app.alchemyapi-metadata.numKeywords": 5,
"app.alchemyapi-metadata.strict": "true"
}
}
},
{
"entities": [
{
"actual_name": "$metadata.json.actor.displayName",
"dimension": "Who",
"disambiguated_name": "$metadata.json.actor.preferredUsername",
"linkdata": "$metadata.json.actor.link",
"type": "TwitterHandle"
},
{
"iterateOver": "json.twitter_entities.user_mentions",
"actual_name": "$SCRIPT(return _iterator.name;)",
"dimension": "Who",
"disambiguated_name": "$SCRIPT(return _iterator.screen_name;)",
"linkdata": "$SCRIPT(return 'http://www.twitter.com/' + _iterator.screen_name;)",
"type": "TwitterHandle"
},
{
"actual_name": "$metadata.json.object.actor.displayName",
"dimension": "Who",
"disambiguated_name": "$metadata.json.object.actor.preferredUsername",
"linkdata": "$metadata.json.object.actor.link",
"type": "TwitterHandle"
},
{
"dimension": "Where",
"disambiguated_name": "$metadata.json.actor.location.displayName",
"geotag": {
"city": "$SCRIPT( return getAddressVal( _doc.metadata.json[0].actor.location.displayName, 0 ) )",
"stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )",
"countryCode": "US",
"alternatives": [
{
"stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )",
"countryCode": "US"
}
]
},
"type": "Location"
},
{
"dimension": "Where",
"disambiguated_name": "$metadata.json.object.actor.location.displayName",
"type": "Location"
},
{
"disambiguated_name": "$SCRIPT(return _iterator.text;)",
"iterateOver": "json.twitter_entities.hashtags",
"type": "HashTag"
},
{
"actual_name": "$SCRIPT(return _iterator.url)",
"disambiguated_name": "$SCRIPT(return _iterator.expanded_url;)",
"iterateOver": "json.gnip.urls",
"type": "URL"
}
]
},
{
"associations": [
{
"assoc_type": "Event",
"creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor); )",
"entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
"entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)",
"verb": "retweets",
"verb_category": "retweets"
},
{
"assoc_type": "Event",
"creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor) && (null != _doc.metadata.json[0].object.actor.location); )",
"entity1_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)",
"entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.location.displayName+ '/location';)",
"verb": "twitter_location",
"verb_category": "twitter_location"
},
{
"assoc_type": "Event",
"entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
"entity2_index": "$SCRIPT( return _iterator.text + '/hashtag'; )",
"iterateOver": "json.twitter_entities.hashtags",
"verb": "tweets_about",
"verb_category": "tweets_about"
},
{
"assoc_type": "Event",
"entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
"entity2_index": "$SCRIPT( return _iterator.screen_name + '/twitterhandle'; )",
"iterateOver": "json.twitter_entities.user_mentions",
"verb": "tweets_to",
"verb_category": "tweets_to"
},
{
"assoc_type": "Event",
"entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
"entity2_index": "$SCRIPT( return _iterator.expanded_url + '/url'; )",
"iterateOver": "json.gnip.urls",
"verb": "tweets_link",
"verb_category": "tweets_link"
}
]
},
{
"storageSettings": {
"rejectDocCriteria": "$SCRIPT( if (null == _doc.metadata.json[0].link || null == _doc.metadata.json[0].object) return 'reject'; )"
}
}
]
} |
Sample output
Code Block | ||
---|---|---|
| ||
{
"associations": [
{
"assoc_type": "Event",
"entity1": "focalcrm",
"entity1_index": "focalcrm/twitterhandle",
"entity2": "crm",
"entity2_index": "crm/hashtag",
"verb": "tweets_about",
"verb_category": "tweets_about"
},
{
"assoc_type": "Event",
"entity1": "focalcrm",
"entity1_index": "focalcrm/twitterhandle",
"entity2": "http://www.crmbuyer.com/rsstory/76578.html",
"entity2_index": "http://www.crmbuyer.com/rsstory/76578.html/url",
"verb": "tweets_link",
"verb_category": "tweets_link"
}
],
"communityId": ["506dc16dfbf042893dd6b8f2"],
"created": "May 16, 2013 12:28:09 PM UTC",
"description": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm",
"entities": [
{
"actual_name": "CRM Buddy",
"dimension": "Who",
"disambiguated_name": "FocalCRM",
"doccount": 0,
"frequency": 1,
"index": "focalcrm/twitterhandle",
"linkdata": ["http://www.twitter.com/FocalCRM"],
"relevance": 0,
"totalfrequency": -1,
"type": "TwitterHandle"
},
{
"actual_name": "crm",
"dimension": "What",
"disambiguated_name": "crm",
"doccount": 0,
"frequency": 1,
"index": "crm/hashtag",
"relevance": 0,
"totalfrequency": -1,
"type": "HashTag"
},
{
"actual_name": "http://t.co/IvwmjJyV",
"dimension": "What",
"disambiguated_name": "http://www.crmbuyer.com/rsstory/76578.html",
"doccount": 0,
"frequency": 1,
"index": "http://www.crmbuyer.com/rsstory/76578.html/url",
"relevance": 0,
"totalfrequency": -1,
"type": "URL"
},
{
"actual_name": "Amex Teams",
"dimension": "What",
"disambiguated_name": "Amex Teams",
"doccount": -1,
"frequency": 1,
"index": "amex teams/keyword",
"relevance": 0.758636,
"sentiment": 0.160753,
"totalfrequency": -1,
"type": "Keyword"
},
{
"actual_name": "Halo",
"dimension": "What",
"disambiguated_name": "Halo",
"doccount": -1,
"frequency": 1,
"index": "halo/keyword",
"relevance": 0.461833,
"sentiment": 0.168822,
"totalfrequency": -1,
"type": "Keyword"
},
{
"actual_name": "Master Chief Incentives",
"dimension": "What",
"disambiguated_name": "Master Chief Incentives",
"doccount": -1,
"frequency": 1,
"index": "master chief incentives/keyword",
"relevance": 0.981457,
"sentiment": 0.168876,
"totalfrequency": -1,
"type": "Keyword"
},
{
"actual_name": "http://t.co/IvwmjJyV",
"dimension": "What",
"disambiguated_name": "http://t.co/IvwmjJyV",
"doccount": -1,
"frequency": 1,
"index": "http://t.co/ivwmjjyv/keyword",
"relevance": 0.212007,
"sentiment": 0.126168,
"totalfrequency": -1,
"type": "Keyword"
},
{
"actual_name": "crm",
"dimension": "What",
"disambiguated_name": "crm",
"doccount": -1,
"frequency": 1,
"index": "crm/keyword",
"relevance": 0.404086,
"sentiment": 0.103838,
"totalfrequency": -1,
"type": "Keyword"
}
],
"mediaType": ["Social"],
"metadata": {"json": [{
"actor": {
"displayName": "CRM Buddy",
"followersCount": "245",
"friendsCount": "0",
"id": "id:twitter.com:835627776",
"image": "http://a0.twimg.com/profile_images/2630355549/8cad59efaddd57283dbb159332336744_normal.jpeg",
"languages": ["en"],
"link": "http://www.twitter.com/FocalCRM",
"links": [{"rel": "me"}],
"listedCount": "6",
"objectType": "person",
"postedTime": "2012-09-20T13:59:56.000Z",
"preferredUsername": "FocalCRM",
"statusesCount": "3688",
"summary": "",
"verified": "false"
},
"body": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm",
"generator": {
"displayName": "dlvr.it",
"link": "http://dlvr.it"
},
"gnip": {
"klout_score": "48",
"language": {"value": "en"},
"matching_rules": [{"value": "halo 4"}],
"urls": [{
"expanded_url": "http://www.crmbuyer.com/rsstory/76578.html",
"url": "http://t.co/IvwmjJyV"
}]
},
"id": "tag:search.twitter.com,2005:266601489475186688",
"link": "http://twitter.com/FocalCRM/statuses/266601489475186688",
"object": {
"id": "object:search.twitter.com,2005:266601489475186688",
"link": "http://twitter.com/FocalCRM/statuses/266601489475186688",
"objectType": "note",
"postedTime": "2012-11-08T18:02:02.000Z",
"summary": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm"
},
"objectType": "activity",
"postedTime": "2012-11-08T18:02:02.000Z",
"provider": {
"displayName": "Twitter",
"link": "http://www.twitter.com",
"objectType": "service"
},
"retweetCount": "0",
"twitter_entities": {
"hashtags": [{
"indices": [
"71",
"75"
],
"text": "crm"
}],
"urls": [{
"display_url": "dlvr.it/2S6sjV",
"expanded_url": "http://dlvr.it/2S6sjV",
"indices": [
"50",
"70"
],
"url": "http://t.co/IvwmjJyV"
}],
"user_mentions": []
},
"verb": "post"
}]},
"modified": "Nov 8, 2012 06:02:44 PM UTC",
"publishedDate": "Nov 8, 2012 06:02:02 PM UTC",
"source": ["gnip test"],
"sourceKey": [".mnt.fileshare.datasift.gnip."],
"sourceUrl": "file:/mnt/fileshare/datasift/gnip/gnip.json",
"tags": [
"twitter",
"gnip"
],
"title": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm",
"url": "http://twitter.com/FocalCRM/statuses/266601489475186688"
} |
Source
Code Block |
---|
//TODO (INF-1865): need to distinguish between "tweets_to" and "retweets"...
//TODO (INF-1865): looks like body is HTML encoded
//TODO (INF-1865): aggregate sentiment vs user?
//TODO (INF-1865): distinguish between tweets and mentions
{ "description": "A large set of tweets related to Super Storm Sandy", "extractType": "File", "extractorOptions": { "app.alchemyapi-metadata.batchSize": 100, "app.alchemyapi-metadata.numKeywords": 5, "app.alchemyapi-metadata.strict": "true" }, "file": { "XmlPrimaryKey": "link", "XmlSourceName": "", "XmlRootLevelValues": [], "domain": "XXX", "password": "XXX", "username": "XXX" }, "isApproved": true, "isPublic": false, "mediaType": "Social", "structuredAnalysis": { "rejectDocCriteria": "$SCRIPT( if (null == _doc.metadata.json[0].link || null == _doc.metadata.json[0].object) return 'reject'; )", "metadataFieldList": "", "docGeo" : { "lat": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[0];} catch (err) {return '';})", "lon": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[1];} catch (err) {return '';})" }, "associations": [ { "assoc_type": "Event", "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor); )", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)", "verb": "retweets", "verb_category": "retweets" }, { "assoc_type": "Event", "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor) && (null != _doc.metadata.json[0].object.actor.location); )", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.location.displayName+ '/location';)", "verb": "twitter_location", "verb_category": "twitter_location" },
{ "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.text + '/hashtag'; )", "iterateOver": "json.twitter_entities.hashtags", "verb": "tweets_about", "verb_category": "tweets_about" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.screen_name + '/twitterhandle'; )", "iterateOver": "json.twitter_entities.user_mentions", "verb": "tweets_to", "verb_category": "tweets_to" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.expanded_url + '/url'; )", "iterateOver": "json.gnip.urls", "verb": "tweets_link", "verb_category": "tweets_link" } ], "description": "$metadata.json.body", "entities": [ { "actual_name": "$metadata.json.actor.displayName", "dimension": "Who", "disambiguated_name": "$metadata.json.actor.preferredUsername", "linkdata": "$metadata.json.actor.link", "type": "TwitterHandle" }, { "iterateOver": "json.twitter_entities.user_mentions", "actual_name": "$SCRIPT(return _iterator.name;)", "dimension": "Who", "disambiguated_name": "$SCRIPT(return _iterator.screen_name;)", "linkdata": "$SCRIPT(return 'http://www.twitter.com/' + _iterator.screen_name;)", "type": "TwitterHandle" }, { "actual_name": "$metadata.json.object.actor.displayName", "dimension": "Who", "disambiguated_name": "$metadata.json.object.actor.preferredUsername", "linkdata": "$metadata.json.object.actor.link", "type": "TwitterHandle" }, { "dimension": "Where", "disambiguated_name": "$metadata.json.actor.location.displayName", "geotag": { "city": "$SCRIPT( return getAddressVal( _doc.metadata.json[0].actor.location.displayName, 0 ) )", "stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )", "countryCode" : "US", "alternatives": [ { 
"stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )", "countryCode" : "US" } ] }, "type": "Location" }, { "dimension": "Where", "disambiguated_name": "$metadata.json.object.actor.location.displayName", "type": "Location" }, { "disambiguated_name": "$SCRIPT(return _iterator.text;)", "iterateOver": "json.twitter_entities.hashtags", "type": "HashTag" }, { "actual_name": "$SCRIPT(return _iterator.url)", "disambiguated_name": "$SCRIPT(return _iterator.expanded_url;)", "iterateOver": "json.gnip.urls", "type": "URL" } ], "fullText": "$metadata.json.body", "script": "function getAddressVal( addressStr, number) { try { var addressArray = addressStr.split(/ *, */); if (addressArray != null && addressArray.length > 0) { if (addressArray[number].toLowerCase()=='ny') { return 'new york'; } else if (addressArray[number].toLowerCase()=='long island' || addressArray[number].toLowerCase()=='li') { return 'medford'; } else { return addressArray[number]; } } else { return ''; } } catch (err) { return ''; } } function getRegion( code ) { if (code.toLowerCase()=='ny') {return 'New York';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else if (code.toLowerCase()=='ct') {return 'Connecticut';} else if (code.toLowerCase()=='md') {return 'Maryland';} else if (code.toLowerCase()=='va') {return 'Virginia';} else if (code.toLowerCase()=='pa') {return 'Pennsylvania';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else {return 'New York';} }", "scriptEngine": "javascript", "title": "$metadata.json.body", "url": "$metadata.json.link", "publishedDate": "$SCRIPT(return _doc.metadata.json[0].postedTime.replace(/.[0-9]{3}Z/,'Z');)" }, "tags": [ "twitter", "gnip" ], "title": "Super Storm Sandy - Twitter: SANDY_SUBSTRING", "url": "smb://HOST:139/SHARE/PATH/TO/", "useExtractor": "AlchemyAPI-metadata", "useTextExtractor": "none" } |
Sample output
...