Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

Sample document

TODO

Source

TODO: geo (use docGeo.alternatives, retain only those metadata fields that are wanted); distinguish between tweets and mentions

Code Block
{
    "description": "2.5 million tweets related to Super Storm Sandy",
    "extractType": "File",
    "file": {
        "XmlPrimaryKey": "",
        "XmlRootLevelValues": [],
   //TODO (INF-1865): need to distinguish between "tweets_to" and "retweets"...
//TODO (INF-1865): looks like body is HTML encoded 
//TODO (INF-1865): aggregate sentiment vs user?
{
	"description": "A large set of tweets related to Super Storm Sandy",
	"extractType": "File",
    "domainextractorOptions": "XXXXXXX",{
        "passwordapp.alchemyapi-metadata.batchSize": "XXXXXX"100,
        "usernameapp.alchemyapi-metadata.numKeywords": "XXXXXX"5,      },  
  "isApproved": true,     "isPublic": true,app.alchemyapi-metadata.strict": "true"        "mediaType":
"Social",    },
	"structuredAnalysisfile": {
		"XmlPrimaryKey": "",
		"XmlRootLevelValues": [],
		"rejectDocCriteriadomain": "XXX"$SCRIPT( if (,
		"password": "XXX",
		"username": "XXX"
	},
	"isApproved": true,
	"isPublic": false,
	"mediaType": "Social",
	"structuredAnalysis": {
		"rejectDocCriteria": "$SCRIPT( if (null == _doc.metadata.json[0].link || null == _doc.metadata.json[0].object) return 'reject'; )",
    		"metadataFieldList": "",
    		"docGeo" : {
        			"lat": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[0];} catch (err) {return '';})",
        			"lon": "$SCRIPT( try {return _doc.metadata.json[0].geo.coordinates[1];} catch (err) {return '';})"
    		},
		"associations": [
             {
               	 "assoc_type": "Event",
               	 "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor); )",
               	 "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
            	    "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)",
               	 "verb": "retweets",
            	    "verb_category": "retweets"
             },
             {
            	    "assoc_type": "Event",
               	 "creationCriteriaScript": "$SCRIPT( return (null != _doc.metadata.json[0].object.actor) && (null != _doc.metadata.json[0].object.actor.location); )",
            	    "entity1_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.preferredUsername + '/twitterhandle';)",
            	    "entity2_index": "$SCRIPT( return _doc.metadata.json[0].object.actor.location.displayName+ '/location';)",
               	 "verb": "twitter_location",
               	 "verb_category": "twitter_location"
             },
             {
             	   "assoc_type": "Event",
 
             	 "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
               	 "entity2_index": "$SCRIPT( return _iterator.text + '/hashtag'; )",
               	 "iterateOver": "json.twitter_entities.hashtags",
               	 "verb": "tweets_about",
            	    "verb_category": "tweets_about"
             },
             {
              	  "assoc_type": "Event",
              	  "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
            	    "entity2_index": "$SCRIPT( return _iterator.screen_name + '/twitterhandle'; )",
               	 "iterateOver": "json.twitter_entities.user_mentions",
             	   "verb": "tweets_to",
             	   "verb_category": "tweets_to"
             },
             {
               	 "assoc_type": "Event",
              	  "entity1_index": "$SCRIPT( return _doc.metadata.json[0].actor.preferredUsername + '/twitterhandle';)",
              	  "entity2_index": "$SCRIPT( return _iterator.expanded_url + '/url'; )",
  
            	 "iterateOver": "json.gnip.urls",
               	 "verb": "tweets_link",
               	 "verb_category": "tweets_link"
             }
         ],
         "description": "$metadata.json.body",
         "entities": [
		{
			"actual_name": "$metadata.json.actor.displayName",
			"dimension": "Who",
			"disambiguated_name": "$metadata.json.actor.preferredUsername",
			"linkdata": "$metadata.json.actor.link",
			"type": "TwitterHandle"
		},
		{
			"iterateOver": "json.twitter_entities.user_mentions",
			"actual_name": "$SCRIPT(return _iterator.name;)",
			"dimension": "Who",
			"disambiguated_name": "$SCRIPT(return _iterator.screen_name;)",
			"linkdata": "$SCRIPT(return 'http://www.twitter.com/' + _iterator.screen_name;)",
			"type": "TwitterHandle"
		},
		{
			"actual_name": "$metadata.json.object.actor.displayName",
			"dimension": "Who",
			"disambiguated_name": "$metadata.json.object.actor.preferredUsername",
			"linkdata": "$metadata.json.object.actor.link",
			"type": "TwitterHandle"
		},
		{
			"dimension": "Where",
			"disambiguated_name": "$metadata.json.actor.location.displayName",
			"geotag": {
				"city": "$SCRIPT( return getAddressVal( _doc.metadata.json[0].actor.location.displayName, 0 ) )",
				"stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )",
				"countryCode": "US",
				"alternatives": [
					{
						"stateProvince": "$SCRIPT( return getRegion(getAddressVal( _doc.metadata.json[0].actor.location.displayName, 1 )) )",
						"countryCode": "US"
					}
				]
			},
			"type": "Location"
		},
		{
			"dimension": "Where",
			"disambiguated_name": "$metadata.json.object.actor.location.displayName",
			"type": "Location"
		},
		{
			"disambiguated_name": "$SCRIPT(return _iterator.text;)",
			"iterateOver": "json.twitter_entities.hashtags",
			"type": "HashTag"
		},
		{
			"actual_name": "$SCRIPT(return _iterator.url)",
			"disambiguated_name": "$SCRIPT(return _iterator.expanded_url;)",
			"iterateOver": "json.gnip.urls",
			"type": "URL"
		}
	],
        "fullText": "$metadata.json.body",
        "script": "function getAddressVal( addressStr, number) { try { var addressArray = addressStr.split(/ *, */); if (addressArray != null && addressArray.length > 0) { if (addressArray[number].toLowerCase()=='ny') { return 'new york'; } else if (addressArray[number].toLowerCase()=='long island' || addressArray[number].toLowerCase()=='li') { return 'medford'; } else { return addressArray[number]; } } else { return ''; } } catch (err) { return ''; } } function getRegion( code ) { if (code.toLowerCase()=='ny') {return 'New York';} else if (code.toLowerCase()=='nj') {return 'New Jersey';} else if (code.toLowerCase()=='ct') {return 'Connecticut';} else if (code.toLowerCase()=='md') {return 'Maryland';} else if (code.toLowerCase()=='va') {return 'Virginia';} else if (code.toLowerCase()=='pa') {return 'Pennsylvania';} else {return 'New York';} }",
        "scriptEngine": "javascript",
 
      "title": "$metadata.json.body",
 
      "url": "$metadata.json.link",
 
      "publishedDate": "$SCRIPT(return _doc.metadata.json[0].postedTime.replace(/.[0-9]{3}Z/,'Z');)"
    	},
    	"tags": [
         "twitter",
         "gnip"
     ],
     "title": "Super Storm Sandy - Twitter: SANDY_SUBSTRING",
     "url": "smb://HOST:139/SHARE/PATH/TO/",
     "useExtractor": "AlchemyAPI-metadata",
     "useTextExtractor": "none"
}

...