Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
language: javascript
title: Feed Harvester configuration
"rss": {
	"feedType": string, // Currently not used - will allow for RSS vs Atom in future releases (currently only RSS is supported)
 
	"waitTimeOverride_ms": integer, // Optional - if specified, controls the amount of time between successive reads to a site (default: 10000ms):
					// eg if a site is timing out, it may be limiting the number of accesses from a given IP - in that case set the number higher;
					// for large sites you can increase the performance of the harvester by setting this number lower
	"updateCycle_secs": integer, // Optional - if present harvested URLs may be replaced if they are older than this time and are encountered from the RSS or in the "extraUrls"
	"regexInclude": string, // Optional - if specified, only URLs matching the regex will be harvested
	"regexExclude": string, // Optional - if specified, any URLs matching the regex will not be harvested
	
	"extraUrls": [ // This array allows for manually specified URLs to be harvested once
		{
			"url": string, // The URL 
			"title": string, // The title that the document will be given (ie the equivalent to the RSS title)
			"description": string, // (Optional) The description that the document will be given (ie the equivalent to the RSS description)
			"publishedData": string, // (Optional) The date that will be assigned to the document (default: now) - this can be overridden from "structuredAnalysis"
			"fullText": string // (Optional) If present and "useTextExtractor" is "none", then uses the specified string instead of the URL contents (mainly for debugging)
		},
		//etc
	],
	"userAgent": string, // (Optional) If present overrides the system default user agent string
 	"proxyOverride": string, // (Optional) "direct" to bypass proxy (the default), or a proxy specification "(http|socks)://host:port" 
	"cookies": string, // (Optional) appends this string to the "Cookies" field (can included multiple semi-colon separated cookie values)

	"searchConfig": { ... } // (Optional) A complex configuration object that allows the contents of URLs to be used generate more URLs/docs to harvest
} 

...