Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Note the use of XPath to easily identify how to convert the top-level XML document into lots of little documents - the "rss"/"web" "searchConfig.script" (the "links" block's "script" in the pipeline format) is then boilerplate and converts the XML into lots of small documents, with the "fullText" of each containing the JSON representation of the selected XML. This is then converted into metadata by the "unstructuredAnalysis" block ("contentMetadata" in the pipeline format). Normally a "structuredAnalysis" block (or "docMetadata"/"entities"/"associations" blocks in the pipeline format) would finally be used to set the per-document titles, descriptions, entities etc.

Code Block
languagejavascript
{
    "description": "wiy",
    "extractType": "Feed",
    "isPublic": true,
    "mediaType": "News",
    "tags": [
        "tag1"
    ],
    "title": "aaa xml test",
    "processingPipeline": [
        {
            "feed": {
                "extraUrls": [
                    {
                        "url": "http://www.w3schools.com/xml/simple.xml"
                    }
                ],
                "updateCycle_secs": 86400
            }
        },
        {
            "links": {
                "extraMeta": [
                    {
                        "context": "First",
                        "fieldName": "convert_to_json",
                        "flags": "o",
                        "script": "//breakfast_menu/food[*]",
                        "scriptlang": "xpath"
                    }
                ],
                "script": "function convert_to_docs(jsonarray, url)\n{\n    var docs = [];\n    for (var docIt in jsonarray) {\n        var predoc = jsonarray[docIt];\n        delete predoc.content;\n        var doc = {};\n        doc.url = _doc.url.replace(/[?].*/,\"\") + '#' + docIt;\n        doc.fullText = predoc;\n        doc.title = \"TBD\";\n        doc.description = \"TBD\";\n        docs.push(doc);\n    }\n    return docs;\n}\nvar docs = convert_to_docs(_doc.metadata['convert_to_json'], _doc.url);\ndocs;",
                "scriptflags": "d"
            }
        },
        {
            "contentMetadata": [
                {
                    "fieldName": "json",
                    "script": "var json = eval('('+text+')'); json;",
                    "scriptlang": "javascript"
                }
            ]
        }
    ]
}

Output

Code Block
languagejavascript
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"650\" , \"description\" : \"two of our famous Belgian Waffles with plenty of real maple syrup\" , \"price\" : \"$5.95\" , \"name\" : \"Belgian Waffles\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "650",
        "description": "two of our famous Belgian Waffles with plenty of real maple syrup",
        "name": "Belgian Waffles",
        "price": "$5.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#0"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"900\" , \"description\" : \"light Belgian waffles covered with strawberries and whipped cream\" , \"price\" : \"$7.95\" , \"name\" : \"Strawberry Belgian Waffles\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "900",
        "description": "light Belgian waffles covered with strawberries and whipped cream",
        "name": "Strawberry Belgian Waffles",
        "price": "$7.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#1"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"900\" , \"description\" : \"light Belgian waffles covered with an assortment of fresh berries and whipped cream\" , \"price\" : \"$8.95\" , \"name\" : \"Berry-Berry Belgian Waffles\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "900",
        "description": "light Belgian waffles covered with an assortment of fresh berries and whipped cream",
        "name": "Berry-Berry Belgian Waffles",
        "price": "$8.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#2"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"600\" , \"description\" : \"thick slices made from our homemade sourdough bread\" , \"price\" : \"$4.50\" , \"name\" : \"French Toast\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "600",
        "description": "thick slices made from our homemade sourdough bread",
        "name": "French Toast",
        "price": "$4.50"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#3"
}
{
    "communityId": ["4d38b72c054548f038a0414a"],
    "created": "Jun 5, 2013 09:12:15 PM UTC",
    "description": "TBD",
    "fullText": "{ \"calories\" : \"950\" , \"description\" : \"two eggs, bacon or sausage, toast, and our ever-popular hash browns\" , \"price\" : \"$6.95\" , \"name\" : \"Homestyle Breakfast\"}",
    "mediaType": ["News"],
    "metadata": {"json": [{
        "calories": "950",
        "description": "two eggs, bacon or sausage, toast, and our ever-popular hash browns",
        "name": "Homestyle Breakfast",
        "price": "$6.95"
    }]},
    "modified": "Jun 5, 2013 09:12:15 PM UTC",
    "publishedDate": "Jun 5, 2013 09:12:15 PM UTC",
    "source": ["aaa xml test"],
    "sourceKey": ["www.w3schools.com.xml.simple.xml"],
    "tags": ["tag1"],
    "title": "TBD",
    "url": "http://www.w3schools.com/xml/simple.xml#4"
}

 

Old format source

Code Block
languagejavascript
{
    "description": "wiy",
    "extractType": "Feed",
    "isPublic": true,
    "mediaType": "News",
    "rss": {
        "searchConfig": {
            "extraMeta": [
                {
                    "context": "First",
                    "fieldName": "convert_to_json",
                    "flags": "o",
                    "script": "//breakfast_menu/food[*]",
                    "scriptlang": "xpath"
                }
            ],
            "script": "function convert_to_docs(jsonarray, url)\n{\n    var docs = [];\n    for (var docIt in jsonarray) {\n        var predoc = jsonarray[docIt];\n        delete predoc.content;\n        var doc = {};\n        doc.url = _doc.url.replace(/[?].*/,\"\") + '#' + docIt;\n        doc.fullText = predoc;\n        doc.title = \"TBD\";\n        doc.description = \"TBD\";\n        docs.push(doc);\n    }\n    return docs;\n}\nvar docs = convert_to_docs(_doc.metadata['convert_to_json'], _doc.url);\ndocs;",
            "scriptflags": "d"
        },
        "extraUrls": [
            {
                "url": "http://www.w3schools.com/xml/simple.xml"
            }
        ],
        "updateCycle_secs": 86400
    },
    "tags": [
        "tag1"
    ],
    "title": "aaa xml test",
    "unstructuredAnalysis": {
        "meta": [
            {
                "context": "First",
                "fieldName": "json",
                "script": "var json = eval('('+text+')'); json;",
                "scriptlang": "javascript"
            }
        ]
    },
    "useExtractor": "none",
    "useTextExtractor": "none"
}

 

...