...
Code Block |
---|
#------------------------------------------------------------------------------- # 1.13] Entity Extractor Properties #------------------------------------------------------------------------------- # Alchemy and Open Calais Keys: # (Obtain from alchemyapi.com or opencalais.com) # (can't be blank so set to DUMMY if not in use) extractor.key.alchemyapi=DUMMY extractor.key.opencalais=DUMMY #---------------------------------------------- # Entity extraction type selection: opencalais or alchemyapi or none # ("opencalais" has a much higher limit than "alchemyapi" (1000/day) so is recommended for free use. # "alchemyapi" extracts sentiment, "opencalais" extracts entity associations. Note this can be overridden per source) extractor.entity.default= # Text extraction type selection: boilerpipe or alchemyapi or none # ("alchemyapi" is much better, but has the limit discussed above. Note this can be overridden per source) extractor.text.default= |
...
Code Block |
---|
#------------------------------------------------------------------------------- # 2.8] MongoDB Properties #------------------------------------------------------------------------------- # (server/port should normally point to localhost:27017, where API nodes have a mongos) db.server=localhost db.port=27017 # db.sharded - 0 = false and 1 = true db.sharded=1 # The max number of documents to store (eg 10M). Docs will be dropped in order of age. # (Not currently supported): db.capacity=10000000 # MongoDB config server or servers (must be 1 or 3 comma separated IPs), non-EC2/AWS installations only db.config.servers= db.replica.sets= #---------------------------------------------- # db.cluster.subnet - used for non-EC2/AWS only installations to help mongodb configurations # identify proper host ip addresses, e.g. 127.0.0. db.cluster.subnet= #---------------------------------------------- # The location from which to fetch the geo.bson dump used for feature.geo # can start s3://, http:// or https://, else is assumed to be a file, eg #db.geo_archive=s3://config.saas.infinite.ikanow.com/geo.bson.tar.gz # Can always be retrieved here db.geo_archive=http://www.ikanow.com/infinit.e-preinstall/geo.bson.tar.gz |
...
Code Block |
---|
#------------------------------------------------------------------------------- # 2.11] Harvester Properties #------------------------------------------------------------------------------- # Comma-separated-list from File,Database,Feed (note Database and Feed need jars not bundled with the RPM) harvester.types=File,Database,Feed # Web crawling etiquette: the time to wait between consecutive accesses to the same site (10s is standard) harvest.feed.wait=10000 # The minimum time between consecutive harvests (avoids thrashing FS/DB/RSS when there's nothing to get) harvest.mintime.ms=300000 # The minimum time between consecutive source harvests (set if needs to be longer than harvest.mintime.ms, # eg if you want to pick up a source quickly the first time but then not update so frequently) harvest.source.mintime.ms= # Restricts the number of docs that can be harvested per cycle for memory reasons: harvest.maxdocs_persource=5000 # Threading configuration type:num_threads (type from above): # (eg for RSS heavy increase the "feed", for DB heavy increase the "database" etc. Beyond 20 there is limited benefit). harvest.threads=file:5,database:5,feed:20 # This controls the batch size of sources picked up by a thread, this does not normally need to be changed (its default is shown) # (It can be reduced in cases where a small number of very long-running sources need to be harvested). #harvest.distribution.batch.harvest=20 # This disables entity and association aggregation. For almost all applications you will not want to set this. 
#harvest.disable_aggregation=false # This parameter uses the Java Security Manager to prevent scripts accessing local network services (at the expense of some performance) # It can be turned off for uses of the platform where sources must be approved before being added (etc) harvest.security=true # This is a comma-separated list of hosts in the following format "http://<HOST>[:<PORT>]" or "socks://<HOST>:<PORT>" # When specified, all requests for external content from the harvester are proxied (round-robin) through the specified hosts harvest.proxy= |
...