File source to ingest query results (from either the API or the GUI export)

Source

See here for details on bulk export of data from the API.

This version has a few noteworthy limitations:

  • Tags and mediaType cannot be dynamically specified, so you should segment the files into different directories where possible and then hardcode them in the source fields (see TODO below)
  • The fullText is not generated - this is a fundamental limitation since it is not in the exported JSON. Note that the title, description, entities, and associations are all indexed, so documents will normally have a decent "full text signature" regardless.
    • (If needed, fullText could be populated with a more complex "Feed" source that used "rss.searchConfig" but was otherwise similar to this)
  • The metadata from the old document is stored in metadata.prevmeta (this cannot be worked around unless you know the metadata field names a priori, in which case it can be easily fixed in the "contentMetadata" block below, or in the "unstructuredAnalysis" → "meta" block of the old format)
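  • Associations: the templates below include an "associations" block that iterates over json.associations in order to copy them across. If associations are not needed, this block can simply be removed.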

{
    "description": "Re-imports the documents output from Infinit.e's query API or GUI export",
    "isPublic": true,
    "mediaType": "TODO_INSERT_TYPE_HERE",
    "tags": [
        "TODO_INSERT_TAGS_HERE"
    ],
    "title": "Infinit.e Import Template",
    "processingPipeline": [
        {
            "file": {
                "type": "json",
                "XmlPrimaryKey": "url",
                "XmlRootLevelValues": [
                    "documents",
                    "data"
                ],
                "XmlSourceName": "",
                "url": "file://path/to/files/"
            }
        },
        {
            "docMetadata": {
                "title": "$metadata.json.title",
                "description": "$metadata.json.description",
                "publishedDate": "$SCRIPT( return new Date(_doc.metadata.json[0].publishedDate).toString(); )"
            }
        },
        {
            "contentMetadata": [
                {
                    "fieldName": "prevmeta",
                    "script": "var retval = _metadata.json[0].metadata; retval;",
                    "scriptlang": "javascript",
                    "flags": "m"
                }
            ]
        },
        {
            "textEngine": {
                "engineName": "default"
            }
        },
        {
            "featureEngine": {
                "engineName": "default"
            }
        },
        {
            "entities": [
                {
                    "actual_name": "$SCRIPT( return _iterator.actual_name == null ? _iterator.disambiguated_name : _iterator.actual_name; )",
                    "dimension": "$dimension",
                    "disambiguated_name": "$disambiguated_name",
                    "geotag": {
                        "lat": "$geotag.lat",
                        "lon": "$geotag.lon"
                    },
                    "iterateOver": "json.entities",
                    "linkdata": "$linkdata",
                    "ontology_type": "$ontology_type",
                    "relevance": "$relevance",
                    "frequency": "$frequency",
                    "sentiment": "$sentiment",
                    "type": "$type"
                }
            ]
        },
        {
            "associations": [
                {
                    "assoc_type": "$assoc_type",
                    "entity1": "$entity1",
                    "entity1_index": "$entity1_index",
                    "entity2": "$entity2",
                    "entity2_index": "$entity2_index",
                    "iterateOver": "json.associations",
                    "verb": "$verb",
                    "verb_category": "$verb_category"
                }
            ]
        },
        {
            "storageSettings": {
                "metadataFields": "-json"
            }
        },
        {
            "searchIndex": {
                "metadataFieldList": ""
            }
        }
    ]
}

Source - old format

Note: this source needs to be run on v0.1.7714+, since earlier versions have a bug that will hang the harvester/API test if 1) the max number of docs is exceeded and 2) the data/documents array is not the last JSON element in the file (it won't normally be unless manually edited).

{
    "description": "Re-imports the documents output from Infinit.e's query API or GUI export",
    "extractType": "File",
    "file": {
        "type": "json",
        "XmlPrimaryKey": "url",
        "XmlRootLevelValues": [
            "documents",
            "data"
        ],
        "XmlSourceName": ""
    },
    "isPublic": true,
    "mediaType": "TODO_INSERT_TYPE_HERE",
    "searchIndexFilter": {
        "metadataFieldList": ""
    },
    "structuredAnalysis": {
        "associations": [{
            "assoc_type": "$assoc_type",
            "entity1": "$entity1",
            "entity1_index": "$entity1_index",
            "entity2": "$entity2",
            "entity2_index": "$entity2_index",
            "iterateOver": "json.associations",
            "verb": "$verb",
            "verb_category": "$verb_category"
        }],
        "description": "$metadata.json.description",
        "entities": [
            {
                "actual_name": "$SCRIPT( return _iterator.actual_name == null ? _iterator.disambiguated_name : _iterator.actual_name; )",
                "dimension": "$dimension",
                "disambiguated_name": "$disambiguated_name",
                "geotag": {
                    "lat": "$geotag.lat",
                    "lon": "$geotag.lon"
                },
                "iterateOver": "json.entities",
                "linkdata": "$linkdata",
                "ontology_type": "$ontology_type",
                "relevance": "$relevance",
                "frequency": "$frequency",
                "sentiment": "$sentiment",
                "type": "$type"
            }
        ],
        "metadataFields": "-json",
        "publishedDate": "$SCRIPT( return new Date(_doc.metadata.json[0].publishedDate).toString(); )",
        "title": "$metadata.json.title"
    },
    "tags": [
        "TODO_INSERT_TAGS_HERE"
    ],
    "title": "Infinit.e Import Template",
    "unstructuredAnalysis": {
        "meta": [
            {
                "context": "First",
                "fieldName": "prevmeta",
                "flags": "m",
                "script": "var retval = _metadata.json[0].metadata; retval;",
                "scriptlang": "javascript"
            }
        ]
    },
    "url": "file://path/to/files/"
}