GNIP source gallery
Sample document
Twitter document
{ "id": "tag:search.twitter.com,2005:266601489475186688", "objectType": "activity", "actor": { "objectType": "person", "id": "id:twitter.com:835627776", "link": "http://www.twitter.com/FocalCRM", "displayName": "CRM Buddy", "postedTime": "2012-09-20T13:59:56.000Z", "image": "http://a0.twimg.com/profile_images/2630355549/8cad59efaddd57283dbb159332336744_normal.jpeg", "summary": "", "links": [ { "href": null, "rel": "me" } ], "friendsCount": 0, "followersCount": 245, "listedCount": 6, "statusesCount": 3688, "twitterTimeZone": null, "verified": false, "utcOffset": null, "preferredUsername": "FocalCRM", "languages": [ "en" ] }, "verb": "post", "postedTime": "2012-11-08T18:02:02.000Z", "generator": { "displayName": "dlvr.it", "link": "http://dlvr.it" }, "provider": { "objectType": "service", "displayName": "Twitter", "link": "http://www.twitter.com" }, "link": "http://twitter.com/FocalCRM/statuses/266601489475186688", "body": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm", "object": { "objectType": "note", "id": "object:search.twitter.com,2005:266601489475186688", "summary": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm", "link": "http://twitter.com/FocalCRM/statuses/266601489475186688", "postedTime": "2012-11-08T18:02:02.000Z" }, "twitter_entities": { "urls": [ { "display_url": "dlvr.it/2S6sjV", "indices": [ 50, 70 ], "expanded_url": "http://dlvr.it/2S6sjV", "url": "http://t.co/IvwmjJyV" } ], "user_mentions": [], "hashtags": [ { "text": "crm", "indices": [ 71, 75 ] } ] }, "retweetCount": 0, "gnip": { "language": { "value": "en" }, "matching_rules": [ { "value": "halo 4", "tag": null } ], "klout_score": 48, "urls": [ { "url": "http://t.co/IvwmjJyV", "expanded_url": "http://www.crmbuyer.com/rsstory/76578.html" } ] } }
Source (processingPipeline format)
//TODO (INF-1865): need to distinguish between "tweets_to" and "retweets"... //TODO (INF-1865): looks like body is HTML encoded //TODO (INF-1865): aggregate sentiment vs user? //TODO (INF-1865): distinguish between tweets and mentions { "description": "A large set of tweets related to Super Storm Sandy", "isApproved": true, "isPublic": false, "mediaType": "Social", "tags": [ "twitter", "gnip" ], "title": "Super Storm Sandy - Twitter: SANDY_SUBSTRING", "processingPipeline": [ { "file": { "XmlPrimaryKey": "link", "XmlSourceName": "", "XmlRootLevelValues": [], "domain": "XXX", "password": "XXX", "username": "XXX", "url": "smb://HOST:139/SHARE/PATH/TO/" } }, { "globals": { "scripts": [ "function getAddressVal( addressStr, number) { try { var addressArray = addressStr.split(/ *, */); if (addressArray != null && addressArray.length > 0) { if (addressArray[number].toLowerCase()=='ny') { return 'new york'; } else if (addressArray[number].toLowerCase()=='long island' || addressArray[number].toLowerCase()=='li') { return 'medford'; } else { return addressArray[number]; } } else { return ''; } } catch (err) { return ''; } } function getRegion( code ) { if (code.toLowerCase()=='ny') {return 'New York';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else if (code.toLowerCase()=='ct') {return 'Connecticut';} else if (code.toLowerCase()=='md') {return 'Maryland';} else if (code.toLowerCase()=='va') {return 'Virginia';} else if (code.toLowerCase()=='pa') {return 'Pennsylvania';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else {return 'New York';} }" ] } }, { "docMetadata": { "title": "$metadata.json.body", "description": "$metadata.json.body", "fullText": "$metadata.json.body", "publishedDate": "$SCRIPT(return _doc.metadata.json[0].postedTime.replace(/.[0-9]{3}Z/,'Z');)", "geotag": { "lat": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[0];} catch (err) {return '';})", "lon": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[1];} 
catch (err) {return '';})" } } }, { "featureEngine": { "engineName": "AlchemyAPI-metadata", "engineConfig": { "app.alchemyapi-metadata.batchSize": 100, "app.alchemyapi-metadata.numKeywords": 5, "app.alchemyapi-metadata.strict": "true" } } }, { "entities": [ { "actual_name": "$metadata.json.actor.displayName", "dimension": "Who", "disambiguated_name": "$metadata.json.actor.preferredUsername", "linkdata": "$metadata.json.actor.link", "type": "TwitterHandle" }, { "iterateOver": "json.twitter_entities.user_mentions", "actual_name": "$SCRIPT(return _iterator.name;)", "dimension": "Who", "disambiguated_name": "$SCRIPT(return _iterator.screen_name;)", "linkdata": "$SCRIPT(return 'http://www.twitter.com/' + _iterator.screen_name;)", "type": "TwitterHandle" }, { "actual_name": "$metadata.json.object.actor.displayName", "dimension": "Who", "disambiguated_name": "$metadata.json.object.actor.preferredUsername", "linkdata": "$metadata.json.object.actor.link", "type": "TwitterHandle" }, { "dimension": "Where", "disambiguated_name": "$metadata.json.actor.location.displayName", "geotag": { "city": "$SCRIPT( return getAddressVal( _doc.metadata.json[0].actor.location.displayName, 0 ) )", "stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )", "countryCode": "US", "alternatives": [ { "stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )", "countryCode": "US" } ] }, "type": "Location" }, { "dimension": "Where", "disambiguated_name": "$metadata.json.object.actor.location.displayName", "type": "Location" }, { "disambiguated_name": "$SCRIPT(return _iterator.text;)", "iterateOver": "json.twitter_entities.hashtags", "type": "HashTag" }, { "actual_name": "$SCRIPT(return _iterator.url)", "disambiguated_name": "$SCRIPT(return _iterator.expanded_url;)", "iterateOver": "json.gnip.urls", "type": "URL" } ] }, { "associations": [ { "assoc_type": "Event", 
"creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor); )", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)", "verb": "retweets", "verb_category": "retweets" }, { "assoc_type": "Event", "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor) && (null != _doc.metadata.json[0].object.actor.location); )", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.location.displayName+ '/location';)", "verb": "twitter_location", "verb_category": "twitter_location" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.text + '/hashtag'; )", "iterateOver": "json.twitter_entities.hashtags", "verb": "tweets_about", "verb_category": "tweets_about" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.screen_name + '/twitterhandle'; )", "iterateOver": "json.twitter_entities.user_mentions", "verb": "tweets_to", "verb_category": "tweets_to" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.expanded_url + '/url'; )", "iterateOver": "json.gnip.urls", "verb": "tweets_link", "verb_category": "tweets_link" } ] }, { "storageSettings": { "rejectDocCriteria": "$SCRIPT( if (null == _doc.metadata.json[0].link || null == _doc.metadata.json[0].object) return 'reject'; )" } } ] }
Sample output
{ "associations": [ { "assoc_type": "Event", "entity1": "focalcrm", "entity1_index": "focalcrm/twitterhandle", "entity2": "crm", "entity2_index": "crm/hashtag", "verb": "tweets_about", "verb_category": "tweets_about" }, { "assoc_type": "Event", "entity1": "focalcrm", "entity1_index": "focalcrm/twitterhandle", "entity2": "http://www.crmbuyer.com/rsstory/76578.html", "entity2_index": "http://www.crmbuyer.com/rsstory/76578.html/url", "verb": "tweets_link", "verb_category": "tweets_link" } ], "communityId": ["506dc16dfbf042893dd6b8f2"], "created": "May 16, 2013 12:28:09 PM UTC", "description": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm", "entities": [ { "actual_name": "CRM Buddy", "dimension": "Who", "disambiguated_name": "FocalCRM", "doccount": 0, "frequency": 1, "index": "focalcrm/twitterhandle", "linkdata": ["http://www.twitter.com/FocalCRM"], "relevance": 0, "totalfrequency": -1, "type": "TwitterHandle" }, { "actual_name": "crm", "dimension": "What", "disambiguated_name": "crm", "doccount": 0, "frequency": 1, "index": "crm/hashtag", "relevance": 0, "totalfrequency": -1, "type": "HashTag" }, { "actual_name": "http://t.co/IvwmjJyV", "dimension": "What", "disambiguated_name": "http://www.crmbuyer.com/rsstory/76578.html", "doccount": 0, "frequency": 1, "index": "http://www.crmbuyer.com/rsstory/76578.html/url", "relevance": 0, "totalfrequency": -1, "type": "URL" }, { "actual_name": "Amex Teams", "dimension": "What", "disambiguated_name": "Amex Teams", "doccount": -1, "frequency": 1, "index": "amex teams/keyword", "relevance": 0.758636, "sentiment": 0.160753, "totalfrequency": -1, "type": "Keyword" }, { "actual_name": "Halo", "dimension": "What", "disambiguated_name": "Halo", "doccount": -1, "frequency": 1, "index": "halo/keyword", "relevance": 0.461833, "sentiment": 0.168822, "totalfrequency": -1, "type": "Keyword" }, { "actual_name": "Master Chief Incentives", "dimension": "What", "disambiguated_name": "Master Chief Incentives", 
"doccount": -1, "frequency": 1, "index": "master chief incentives/keyword", "relevance": 0.981457, "sentiment": 0.168876, "totalfrequency": -1, "type": "Keyword" }, { "actual_name": "http://t.co/IvwmjJyV", "dimension": "What", "disambiguated_name": "http://t.co/IvwmjJyV", "doccount": -1, "frequency": 1, "index": "http://t.co/ivwmjjyv/keyword", "relevance": 0.212007, "sentiment": 0.126168, "totalfrequency": -1, "type": "Keyword" }, { "actual_name": "crm", "dimension": "What", "disambiguated_name": "crm", "doccount": -1, "frequency": 1, "index": "crm/keyword", "relevance": 0.404086, "sentiment": 0.103838, "totalfrequency": -1, "type": "Keyword" } ], "mediaType": ["Social"], "metadata": {"json": [{ "actor": { "displayName": "CRM Buddy", "followersCount": "245", "friendsCount": "0", "id": "id:twitter.com:835627776", "image": "http://a0.twimg.com/profile_images/2630355549/8cad59efaddd57283dbb159332336744_normal.jpeg", "languages": ["en"], "link": "http://www.twitter.com/FocalCRM", "links": [{"rel": "me"}], "listedCount": "6", "objectType": "person", "postedTime": "2012-09-20T13:59:56.000Z", "preferredUsername": "FocalCRM", "statusesCount": "3688", "summary": "", "verified": "false" }, "body": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm", "generator": { "displayName": "dlvr.it", "link": "http://dlvr.it" }, "gnip": { "klout_score": "48", "language": {"value": "en"}, "matching_rules": [{"value": "halo 4"}], "urls": [{ "expanded_url": "http://www.crmbuyer.com/rsstory/76578.html", "url": "http://t.co/IvwmjJyV" }] }, "id": "tag:search.twitter.com,2005:266601489475186688", "link": "http://twitter.com/FocalCRM/statuses/266601489475186688", "object": { "id": "object:search.twitter.com,2005:266601489475186688", "link": "http://twitter.com/FocalCRM/statuses/266601489475186688", "objectType": "note", "postedTime": "2012-11-08T18:02:02.000Z", "summary": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm" }, "objectType": 
"activity", "postedTime": "2012-11-08T18:02:02.000Z", "provider": { "displayName": "Twitter", "link": "http://www.twitter.com", "objectType": "service" }, "retweetCount": "0", "twitter_entities": { "hashtags": [{ "indices": [ "71", "75" ], "text": "crm" }], "urls": [{ "display_url": "dlvr.it/2S6sjV", "expanded_url": "http://dlvr.it/2S6sjV", "indices": [ "50", "70" ], "url": "http://t.co/IvwmjJyV" }], "user_mentions": [] }, "verb": "post" }]}, "modified": "Nov 8, 2012 06:02:44 PM UTC", "publishedDate": "Nov 8, 2012 06:02:02 PM UTC", "source": ["gnip test"], "sourceKey": [".mnt.fileshare.datasift.gnip."], "sourceUrl": "file:/mnt/fileshare/datasift/gnip/gnip.json", "tags": [ "twitter", "gnip" ], "title": "Amex Teams With Halo 4 on Master Chief Incentives http://t.co/IvwmjJyV #crm", "url": "http://twitter.com/FocalCRM/statuses/266601489475186688" }
Source (structuredAnalysis format)
//TODO (INF-1865): need to distinguish between "tweets_to" and "retweets"... //TODO (INF-1865): looks like body is HTML encoded //TODO (INF-1865): aggregate sentiment vs user? //TODO (INF-1865): distinguish between tweets and mentions { "description": "A large set of tweets related to Super Storm Sandy", "extractType": "File", "extractorOptions": { "app.alchemyapi-metadata.batchSize": 100, "app.alchemyapi-metadata.numKeywords": 5, "app.alchemyapi-metadata.strict": "true" }, "file": { "XmlPrimaryKey": "link", "XmlSourceName": "", "XmlRootLevelValues": [], "domain": "XXX", "password": "XXX", "username": "XXX" }, "isApproved": true, "isPublic": false, "mediaType": "Social", "structuredAnalysis": { "rejectDocCriteria": "$SCRIPT( if (null == _doc.metadata.json[0].link || null == _doc.metadata.json[0].object) return 'reject'; )", "metadataFieldList": "", "docGeo" : { "lat": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[0];} catch (err) {return '';})", "lon": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[1];} catch (err) {return '';})" }, "associations": [ { "assoc_type": "Event", "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor); )", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)", "verb": "retweets", "verb_category": "retweets" }, { "assoc_type": "Event", "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor) && (null != _doc.metadata.json[0].object.actor.location); )", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.location.displayName+ '/location';)", "verb": "twitter_location", "verb_category": "twitter_location" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return 
_doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.text + '/hashtag'; )", "iterateOver": "json.twitter_entities.hashtags", "verb": "tweets_about", "verb_category": "tweets_about" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.screen_name + '/twitterhandle'; )", "iterateOver": "json.twitter_entities.user_mentions", "verb": "tweets_to", "verb_category": "tweets_to" }, { "assoc_type": "Event", "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)", "entity2_index": "$SCRIPT( return _iterator.expanded_url + '/url'; )", "iterateOver": "json.gnip.urls", "verb": "tweets_link", "verb_category": "tweets_link" } ], "description": "$metadata.json.body", "entities": [ { "actual_name": "$metadata.json.actor.displayName", "dimension": "Who", "disambiguated_name": "$metadata.json.actor.preferredUsername", "linkdata": "$metadata.json.actor.link", "type": "TwitterHandle" }, { "iterateOver": "json.twitter_entities.user_mentions", "actual_name": "$SCRIPT(return _iterator.name;)", "dimension": "Who", "disambiguated_name": "$SCRIPT(return _iterator.screen_name;)", "linkdata": "$SCRIPT(return 'http://www.twitter.com/' + _iterator.screen_name;)", "type": "TwitterHandle" }, { "actual_name": "$metadata.json.object.actor.displayName", "dimension": "Who", "disambiguated_name": "$metadata.json.object.actor.preferredUsername", "linkdata": "$metadata.json.object.actor.link", "type": "TwitterHandle" }, { "dimension": "Where", "disambiguated_name": "$metadata.json.actor.location.displayName", "geotag": { "city": "$SCRIPT( return getAddressVal( _doc.metadata.json[0].actor.location.displayName, 0 ) )", "stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )", "countryCode" : "US", "alternatives": [ { 
"stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )", "countryCode" : "US" } ] }, "type": "Location" }, { "dimension": "Where", "disambiguated_name": "$metadata.json.object.actor.location.displayName", "type": "Location" }, { "disambiguated_name": "$SCRIPT(return _iterator.text;)", "iterateOver": "json.twitter_entities.hashtags", "type": "HashTag" }, { "actual_name": "$SCRIPT(return _iterator.url)", "disambiguated_name": "$SCRIPT(return _iterator.expanded_url;)", "iterateOver": "json.gnip.urls", "type": "URL" } ], "fullText": "$metadata.json.body", "script": "function getAddressVal( addressStr, number) { try { var addressArray = addressStr.split(/ *, */); if (addressArray != null && addressArray.length > 0) { if (addressArray[number].toLowerCase()=='ny') { return 'new york'; } else if (addressArray[number].toLowerCase()=='long island' || addressArray[number].toLowerCase()=='li') { return 'medford'; } else { return addressArray[number]; } } else { return ''; } } catch (err) { return ''; } } function getRegion( code ) { if (code.toLowerCase()=='ny') {return 'New York';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else if (code.toLowerCase()=='ct') {return 'Connecticut';} else if (code.toLowerCase()=='md') {return 'Maryland';} else if (code.toLowerCase()=='va') {return 'Virginia';} else if (code.toLowerCase()=='pa') {return 'Pennsylvania';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else {return 'New York';} }", "scriptEngine": "javascript", "title": "$metadata.json.body", "url": "$metadata.json.link", "publishedDate": "$SCRIPT(return _doc.metadata.json[0].postedTime.replace(/.[0-9]{3}Z/,'Z');)" }, "tags": [ "twitter", "gnip" ], "title": "Super Storm Sandy - Twitter: SANDY_SUBSTRING", "url": "smb://HOST:139/SHARE/PATH/TO/", "useExtractor": "AlchemyAPI-metadata", "useTextExtractor": "none" }