Friday, July 26, 2013

Using a webMethods Java service (SAX parser) to split a huge XML file into multiple small files.

I initially wanted to use the StAX parser, which (judging from some sample code online) is much easier to use and more efficient than the SAX parser, but I finally decided to use the SAX parser. I am currently using a webMethods 8.2.2 environment with Java 1.6, so I had to use the SAX parser to split the files.

My requirement was to split a 3 GB XML file into small XML files. This huge XML file contained a list of orders; below is a sample of its structure.
<?xml version="1.0" encoding="UTF-8"?>

<rootNode>

       <childNode>

               <ordernumber>12354</ordernumber>



       </childNode>

       <childNode>

               <ordernumber>12355</ordernumber>



       </childNode>

       <childNode>

               <ordernumber>12356</ordernumber>



       </childNode>

</rootNode>

The result should look like this:

file1:
<?xml version="1.0" encoding="UTF-8"?>

<rootNode>

       <childNode>

               <ordernumber>12354</ordernumber>



       </childNode>

       <childNode>

               <ordernumber>12355</ordernumber>



       </childNode>

</rootNode>

file2:
<?xml version="1.0" encoding="UTF-8"?>

<rootNode>

       <childNode>

               <ordernumber>12356</ordernumber>



       </childNode>

</rootNode>

Java Service Code:
// Splits the XML document named by the "fileName" pipeline input into several
// smaller documents written to "targetDirectotryForSplitFiles" as rest_<n>.xml.
// Every element named "searchElement" is treated as one record; each output
// file is wrapped in a "rootName" element by writeToFile.
// Pipeline outputs: "result" holds the text of a parser/IO exception,
// "filerror" holds the text of an exception raised while writing a chunk.
final IDataCursor pipelineCursor = pipeline.getCursor();
final String fileName = IDataUtil.getString( pipelineCursor, "fileName" );
final String targetDirectotryForSplitFiles = IDataUtil.getString( pipelineCursor, "targetDirectotryForSplitFiles" );
final String searchElement = IDataUtil.getString( pipelineCursor, "searchElement" );
final String rootName = IDataUtil.getString( pipelineCursor, "rootName" );

try {
    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();

    // Pass 1: count the records by streaming instead of building a DOM tree,
    // so the whole input never has to fit in memory.
    final int[] recordCount = new int[1];
    saxParserFactory.newSAXParser().parse(fileName, new DefaultHandler() {
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) {
            if (qName.equals(searchElement)) {
                recordCount[0]++;
            }
        }
    });

    // Threshold on the number of closed elements buffered before a chunk is
    // written. NOTE(review): the factor 7 is kept from the original code; its
    // intent is not evident here - confirm it suits the record layout.
    final int chunkSize = recordCount[0] * 7;

    // Pass 2: re-serialise the records and write them out chunk by chunk.
    DefaultHandler splitHandler = new DefaultHandler() {
        private final ArrayList<String> outList = new ArrayList<String>();
        private int count = 0;
        private int fileNameCount = 0;
        private boolean insideRecord = false;

        // Rebuilds a start tag including its attributes, quoted and escaped.
        private String startTag(String qName, Attributes attributes) {
            StringBuilder tag = new StringBuilder("<").append(qName);
            for (int i = 0; i < attributes.getLength(); i++) {
                // getQName is used because getLocalName may be empty when the
                // parser is not namespace-aware (the factory default).
                tag.append(' ').append(attributes.getQName(i))
                   .append("=\"")
                   .append(StringEscapeUtils.escapeXml(attributes.getValue(i)))
                   .append('"');
            }
            return tag.append('>').toString();
        }

        // Writes the buffered fragments to the next rest_<n>.xml file. On
        // failure the error is reported in "filerror" and the buffer is kept.
        private void flush() {
            try {
                writeToFile(outList, targetDirectotryForSplitFiles + "/rest_" + fileNameCount + ".xml", rootName);
                fileNameCount++;
                outList.clear();
                count = 0;
            } catch (IOException e) {
                IDataUtil.put(pipelineCursor, "filerror", e.toString());
            }
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            if (qName.equalsIgnoreCase(searchElement)) {
                insideRecord = true;
            } else if (qName.equals(rootName)) {
                // The root wrapper is re-created by writeToFile for every chunk.
                return;
            }
            outList.add(startTag(qName, attributes));
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // Text outside a record (e.g. whitespace between records) is dropped.
            if (insideRecord) {
                outList.add(StringEscapeUtils.escapeXml(new String(ch, start, length)));
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            boolean recordEnd = qName.equalsIgnoreCase(searchElement);
            if (!recordEnd && qName.equals(rootName)) {
                // Skip the root end tag; writeToFile appends it itself.
                return;
            }
            outList.add("</" + qName + ">");
            count++;
            if (recordEnd) {
                insideRecord = false;
                // Only split on a record boundary so no record is cut in half.
                if (count > chunkSize) {
                    flush();
                }
            }
        }

        @Override
        public void endDocument() throws SAXException {
            // Write whatever is left after the last full chunk.
            if (!outList.isEmpty()) {
                flush();
            }
        }
    };

    saxParserFactory.newSAXParser().parse(fileName, splitHandler);
} catch (ParserConfigurationException e) {
    IDataUtil.put(pipelineCursor, "result", e.toString());
} catch (SAXException e) {
    IDataUtil.put(pipelineCursor, "result", e.toString());
} catch (IOException e) {
    IDataUtil.put(pipelineCursor, "result", e.toString());
} finally {
    // Destroy the cursor only after the last possible write to the pipeline.
    pipelineCursor.destroy();
}
Shared Code:
private static void writeToFile(ArrayList inputLines, String fileName, String rootName) throws IOException{
  
  ArrayList templist=new ArrayList();
  templist.add("<?xml version="1.0" encoding="UTF-8"?>");
  templist.add("<"+rootName+">");
  templist.addAll(inputLines);
  templist.add("</"+rootName+">");
  FileWriter writer = new FileWriter(fileName); 
  for(String str: templist) {
    writer.write(str);
  }
  writer.close();
 }
Imports:
import java.io.*;
import java.util.ArrayList;
import javax.xml.parsers.*;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.commons.lang.StringEscapeUtils;