Simple web-hosted XML containing many documents

Overview

This is similar to the WITS example, except that the XML is hosted on a web server instead of on a file share. Because the Feed Harvester does not have the same built-in decoding capabilities as the File Harvester, the source configuration is a little more complicated.

Example data

http://www.w3schools.com/xml/simple.xml

Note that when accessing web documents you must use "rss.extraUrls" ("feed.extraUrls" in the pipeline format shown below) and specify, at a minimum, the "url" and "title" fields, rather than using the top-level "url" (otherwise the URL is treated as an RSS feed rather than a standalone web page).

<?xml version="1.0" encoding="UTF-8"?>
<breakfast_menu>
   <food>
      <name>Belgian Waffles</name>
      <price>$5.95</price>
      <description>two of our famous Belgian Waffles with plenty of real maple syrup</description>
      <calories>650</calories>
   </food>
   <food>
      <name>Strawberry Belgian Waffles</name>
      <price>$7.95</price>
      <description>light Belgian waffles covered with strawberries and whipped cream</description>
      <calories>900</calories>
   </food>
   <food>
      <name>Berry-Berry Belgian Waffles</name>
      <price>$8.95</price>
      <description>light Belgian waffles covered with an assortment of fresh berries and whipped cream</description>
      <calories>900</calories>
   </food>
   <food>
      <name>French Toast</name>
      <price>$4.50</price>
      <description>thick slices made from our homemade sourdough bread</description>
      <calories>600</calories>
   </food>
   <food>
      <name>Homestyle Breakfast</name>
      <price>$6.95</price>
      <description>two eggs, bacon or sausage, toast, and our ever-popular hash browns</description>
      <calories>950</calories>
   </food>
</breakfast_menu>

Source

Note the use of XPath to identify how to split the top-level XML document into lots of little documents. The search-config script ("links.script" in the pipeline below, "rss.searchConfig.script" in the old format) is then boilerplate: it converts the XML into lots of small documents, with the "fullText" of each containing the JSON representation of the selected XML. This is then converted into metadata by the "contentMetadata" block. Normally the "docMetadata"/"entities"/"associations" blocks would finally be used to set the per-document titles, descriptions, entities, etc.

{
    "description": "wiy",
    "isPublic": true,
    "mediaType": "News",
    "tags": [
        "tag1"
    ],
    "title": "aaa xml test",
    "processingPipeline": [
        {
            "feed": {
                "extraUrls": [
                    {
                        "url": "http://www.w3schools.com/xml/simple.xml"
                    }
                ],
                "updateCycle_secs": 86400
            }
        },
        {
            "links": {
                "extraMeta": [
                    {
                        "context": "First",
                        "fieldName": "convert_to_json",
                        "flags": "o",
                        "script": "//breakfast_menu/food[*]",
                        "scriptlang": "xpath"
                    }
                ],
                "script": "function convert_to_docs(jsonarray, url)\n{\n    var docs = [];\n    for (var docIt in jsonarray) {\n        var predoc = jsonarray[docIt];\n        delete predoc.content;\n        var doc = {};\n        doc.url = _doc.url.replace(/[?].*/,\"\") + '#' + docIt;\n        doc.fullText = predoc;\n        doc.title = \"TBD\";\n        doc.description = \"TBD\";\n        docs.push(doc);\n    }\n    return docs;\n}\nvar docs = convert_to_docs(_doc.metadata['convert_to_json'], _doc.url);\ndocs;",
                "scriptflags": "d"
            }
        },
        {
            "contentMetadata": [
                {
                    "fieldName": "json",
                    "script": "var json = eval('('+text+')'); json;",
                    "scriptlang": "javascript"
                }
            ]
        }
    ]
}

Output

{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"650\" , \"description\" : \"two of our famous Belgian Waffles with plenty of real maple syrup\" , \"price\" : \"$5.95\" , \"name\" : \"Belgian Waffles\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "650",
        "description": "two of our famous Belgian Waffles with plenty of real maple syrup",
        "name": "Belgian Waffles",
        "price": "$5.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#0"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"900\" , \"description\" : \"light Belgian waffles covered with strawberries and whipped cream\" , \"price\" : \"$7.95\" , \"name\" : \"Strawberry Belgian Waffles\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "900",
        "description": "light Belgian waffles covered with strawberries and whipped cream",
        "name": "Strawberry Belgian Waffles",
        "price": "$7.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#1"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"900\" , \"description\" : \"light Belgian waffles covered with an assortment of fresh berries and whipped cream\" , \"price\" : \"$8.95\" , \"name\" : \"Berry-Berry Belgian Waffles\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "900",
        "description": "light Belgian waffles covered with an assortment of fresh berries and whipped cream",
        "name": "Berry-Berry Belgian Waffles",
        "price": "$8.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#2"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"600\" , \"description\" : \"thick slices made from our homemade sourdough bread\" , \"price\" : \"$4.50\" , \"name\" : \"French Toast\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "600",
        "description": "thick slices made from our homemade sourdough bread",
        "name": "French Toast",
        "price": "$4.50"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#3"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"950\" , \"description\" : \"two eggs, bacon or sausage, toast, and our ever-popular hash browns\" , \"price\" : \"$6.95\" , \"name\" : \"Homestyle Breakfast\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "950",
        "description": "two eggs, bacon or sausage, toast, and our ever-popular hash browns",
        "name": "Homestyle Breakfast",
        "price": "$6.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#4"
}

 

Old format source

{
    "description": "wiy",
    "extractType": "Feed",
    "isPublic": true,
    "mediaType": "News",
    "rss": {
        "searchConfig": {
            "extraMeta": [
                {
                    "context": "First",
                    "fieldName": "convert_to_json",
                    "flags": "o",
                    "script": "//breakfast_menu/food[*]",
                    "scriptlang": "xpath"
                }
            ],
            "script": "function convert_to_docs(jsonarray, url)\n{\n    var docs = [];\n    for (var docIt in jsonarray) {\n        var predoc = jsonarray[docIt];\n        delete predoc.content;\n        var doc = {};\n        doc.url = _doc.url.replace(/[?].*/,\"\") + '#' + docIt;\n        doc.fullText = predoc;\n        doc.title = \"TBD\";\n        doc.description = \"TBD\";\n        docs.push(doc);\n    }\n    return docs;\n}\nvar docs = convert_to_docs(_doc.metadata['convert_to_json'], _doc.url);\ndocs;",
            "scriptflags": "d"
        },
        "extraUrls": [
            {
                "url": "http://www.w3schools.com/xml/simple.xml"
            }
        ],
        "updateCycle_secs": 86400
    },
    "tags": [
        "tag1"
    ],
    "title": "aaa xml test",
    "unstructuredAnalysis": {
        "meta": [
            {
                "context": "First",
                "fieldName": "json",
                "script": "var json = eval('('+text+')'); json;",
                "scriptlang": "javascript"
            }
        ]
    },
    "useExtractor": "none",
    "useTextExtractor": "none"
}