...
Code Block |
---|
//TODO (INF-1865): need to distinguish between "tweets_to" and "retweets"...
//TODO (INF-1865): looks like body is HTML encoded
//TODO (INF-1865): aggregate sentiment vs user?
{
  "description": "A large set of tweets related to Super Storm Sandy",
  "extractType": "File",
  "extractorOptions": {
    "app.alchemyapi-metadata.batchSize": 100,
    "app.alchemyapi-metadata.numKeywords": 5,
    "app.alchemyapi-metadata.strict": "true"
  },
  "file": {
    "XmlPrimaryKey": "link",
    "XmlSourceName": "",
    "XmlRootLevelValues": [],
    "domain": "XXX",
    "password": "XXX",
    "username": "XXX"
  },
  "isApproved": true,
  "isPublic": false,
  "mediaType": "Social",
  "structuredAnalysis": {
    "rejectDocCriteria": "$SCRIPT( if (null == _doc.metadata.json[0].link || null == _doc.metadata.json[0].object) return 'reject'; )",
    "metadataFieldList": "",
    "docGeo": {
      "lat": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[0];} catch (err) {return '';})",
      "lon": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[1];} catch (err) {return '';})"
    },
    "associations": [
      {
        "assoc_type": "Event",
        "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor); )",
        "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
        "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)",
        "verb": "retweets",
        "verb_category": "retweets"
      },
      {
        "assoc_type": "Event",
        "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor) && (null != _doc.metadata.json[0].object.actor.location); )",
        "entity1_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)",
        "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.location.displayName + '/location';)",
        "verb": "twitter_location",
        "verb_category": "twitter_location"
      },
      {
        "assoc_type": "Event",
        "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
        "entity2_index": "$SCRIPT( return _iterator.text + '/hashtag'; )",
        "iterateOver": "json.twitter_entities.hashtags",
        "verb": "tweets_about",
        "verb_category": "tweets_about"
      },
      {
        "assoc_type": "Event",
        "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
        "entity2_index": "$SCRIPT( return _iterator.screen_name + '/twitterhandle'; )",
        "iterateOver": "json.twitter_entities.user_mentions",
        "verb": "tweets_to",
        "verb_category": "tweets_to"
      },
      {
        "assoc_type": "Event",
        "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
        "entity2_index": "$SCRIPT( return _iterator.expanded_url + '/url'; )",
        "iterateOver": "json.gnip.urls",
        "verb": "tweets_link",
        "verb_category": "tweets_link"
      }
    ],
    "description": "$metadata.json.body",
    "entities": [
      {
        "actual_name": "$metadata.json.actor.displayName",
        "dimension": "Who",
        "disambiguated_name": "$metadata.json.actor.preferredUsername",
        "linkdata": "$metadata.json.actor.link",
        "type": "TwitterHandle"
      },
      {
        "iterateOver": "json.twitter_entities.user_mentions",
        "actual_name": "$SCRIPT(return _iterator.name;)",
        "dimension": "Who",
        "disambiguated_name": "$SCRIPT(return _iterator.screen_name;)",
        "linkdata": "$SCRIPT(return 'http://www.twitter.com/' + _iterator.screen_name;)",
        "type": "TwitterHandle"
      },
      {
        "actual_name": "$metadata.json.object.actor.displayName",
        "dimension": "Who",
        "disambiguated_name": "$metadata.json.object.actor.preferredUsername",
        "linkdata": "$metadata.json.object.actor.link",
        "type": "TwitterHandle"
      },
      {
        "dimension": "Where",
        "disambiguated_name": "$metadata.json.actor.location.displayName",
        "geotag": {
          "city": "$SCRIPT( return getAddressVal( _doc.metadata.json[0].actor.location.displayName, 0 ); )",
          "stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )); )",
          "countryCode": "US",
          "alternatives": [
            {
              "stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )); )",
              "countryCode": "US"
            }
          ]
        },
        "type": "Location"
      },
      {
        "dimension": "Where",
        "disambiguated_name": "$metadata.json.object.actor.location.displayName",
        "type": "Location"
      },
      {
        "disambiguated_name": "$SCRIPT(return _iterator.text;)",
        "iterateOver": "json.twitter_entities.hashtags",
        "type": "HashTag"
      },
      {
        "actual_name": "$SCRIPT(return _iterator.url;)",
        "disambiguated_name": "$SCRIPT(return _iterator.expanded_url;)",
        "iterateOver": "json.gnip.urls",
        "type": "URL"
      }
    ],
    "fullText": "$metadata.json.body",
    "script": "/* Returns the comma-separated part of addressStr at index number ('ny' -> 'new york', 'long island'/'li' -> 'medford'); '' when the part is missing or addressStr is not a usable string. */ function getAddressVal( addressStr, number ) { try { var addressArray = addressStr.split(/ *, */); if (number < 0 || number >= addressArray.length) { return ''; } var part = addressArray[number]; var key = part.toLowerCase(); if (key == 'ny') { return 'new york'; } if (key == 'long island' || key == 'li') { return 'medford'; } return part; } catch (err) { return ''; } } /* Maps a two-letter state code (case-insensitive) to its state name; any other value, including '' or null, falls back to 'New York'. */ function getRegion( code ) { var regions = { 'ny': 'New York', 'nj': 'New Jersey', 'ct': 'Connecticut', 'md': 'Maryland', 'va': 'Virginia', 'pa': 'Pennsylvania' }; var key = (code == null) ? '' : String(code).toLowerCase(); return regions.hasOwnProperty(key) ? regions[key] : 'New York'; }",
    "scriptEngine": "javascript",
    "title": "$metadata.json.body",
    "url": "$metadata.json.link",
    "publishedDate": "$SCRIPT(return _doc.metadata.json[0].postedTime.replace(/[.][0-9]{3}Z/,'Z');)"
  },
  "tags": [
    "twitter",
    "gnip"
  ],
  "title": "Super Storm Sandy - Twitter: SANDY_SUBSTRING",
  "url": "smb://HOST:139/SHARE/PATH/TO/",
  "useExtractor": "AlchemyAPI-metadata",
  "useTextExtractor": "none"
} |
...