I am using spark-excel(com.crealytics.spark.excel) library to read excel file. If no duplicate column in excel file, the library working fine. If any duplicate column name occurs in excel file, throwing below exception.

How to overcome this error?

Is there any workaround solution to overcome this?

Exception in thread "main" org.apache.spark.sql.AnalysisException: Found duplicate column(s) in the data schema: `net territory`; at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:85)

1

There are 1 answers

0
Nanaji On
Using spark excel API getting exception . 
    StructType schema = DataTypes.createStructType(new StructField[]{DataTypes.createStructField("CGISAI", DataTypes.StringType, true), DataTypes.createStructField("SALES TERRITORY", DataTypes.StringType, true)});
    SQLContext sqlcxt = new SQLContext(jsc);
    Dataset<Row> df = sqlcxt.read()
            .format("com.crealytics.spark.excel")
            .option("path", "file:///"+siteinfofile)
            .option("useHeader", "true")
            .option("spark.read.simpleMode", "true")
            .option("treatEmptyValuesAsNulls", "true")
            .option("inferSchema", "false")
            .option("addColorColumns", "False")
            .option("sheetName", "sheet1")
            .option("startColumn", 22)
            .option("endColumn", 23)
            //.schema(schema)
            .load();
    return df;


This is the code I am using. I am using sparkexcel library from com.crealytics.spark.excel. 

I want the solution to identify whether excel file has duplicate columns or not. if have duplicate columns, how to rename/eliminate the duplicate columns.




WorkAround is as below:
convert .xlsx file into .csv . using spark default csv api that can handle duplicate column names by renaming them automatically.

Below is the code to convert from xlsx to csv file.


/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.huawei.java.tools;

/**
 *
 * @author Nanaji Jonnadula
 */

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.util.CellAddress;
import org.apache.poi.ss.util.CellReference;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.ParserConfigurationException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import static org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
public class ExcelXlsx2Csv {


    private static class SheetToCSV implements SheetContentsHandler {
        private boolean firstCellOfRow = false;
        private int currentRow = -1;
        private int currentCol = -1;

        private StringBuffer lineBuffer = new StringBuffer();

        /** * Destination for data */
        private FileOutputStream outputStream;

        public SheetToCSV(FileOutputStream outputStream) {
            this.outputStream = outputStream;
        }

        @Override
        public void startRow(int rowNum) {
            /** * If there were gaps, output the missing rows: * outputMissingRows(rowNum - currentRow - 1); */
            // Prepare for this row
            firstCellOfRow = true;
            currentRow = rowNum;
            currentCol = -1;

            lineBuffer.delete(0, lineBuffer.length());  //clear lineBuffer
        }

        @Override
        public void endRow(int rowNum) {
            lineBuffer.append('\n');
            try {
                outputStream.write(lineBuffer.substring(0).getBytes());
            } catch (IOException e) {
                System.out.println("save date to file error at row number: {}"+ currentCol);
                throw new RuntimeException("save date to file error at row number: " + currentCol);
            }
        }

        @Override
        public void cell(String cellReference, String formattedValue, XSSFComment comment) {
            if (firstCellOfRow) {
                firstCellOfRow = false;
            } else {
                lineBuffer.append(',');
            }

            // gracefully handle missing CellRef here in a similar way as XSSFCell does
            if (cellReference == null) {
                cellReference = new CellAddress(currentRow, currentCol).formatAsString();
            }

            int thisCol = (new CellReference(cellReference)).getCol();
            int missedCols = thisCol - currentCol - 1;
            if (missedCols > 1) {   
                lineBuffer.append(',');
            }
            currentCol = thisCol;
            if (formattedValue.contains("\n")) {    
                formattedValue = formattedValue.replace("\n", "");
            }
            formattedValue = "\"" + formattedValue + "\"";  
            lineBuffer.append(formattedValue);
        }

        @Override
        public void headerFooter(String text, boolean isHeader, String tagName) {
            // Skip, no headers or footers in CSV
        }
    }



    private static void processSheet(StylesTable styles, ReadOnlySharedStringsTable strings,
                                    SheetContentsHandler sheetHandler, InputStream sheetInputStream) throws Exception {
        DataFormatter formatter = new DataFormatter();
        InputSource sheetSource = new InputSource(sheetInputStream);
        try {
            XMLReader sheetParser = SAXHelper.newXMLReader();
            ContentHandler handler = new XSSFSheetXMLHandler(
                    styles, null, strings, sheetHandler, formatter, false);
            sheetParser.setContentHandler(handler);
            sheetParser.parse(sheetSource);
        } catch (ParserConfigurationException e) {
            throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
        }
    }


    public static void process(String srcFile, String destFile,String sheetname_) throws Exception {
        File xlsxFile = new File(srcFile);
        OPCPackage xlsxPackage = OPCPackage.open(xlsxFile.getPath(), PackageAccess.READ);
        ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(xlsxPackage);
        XSSFReader xssfReader = new XSSFReader(xlsxPackage);
        StylesTable styles = xssfReader.getStylesTable();
        XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
        int index = 0;
        while (iter.hasNext()) {
            InputStream stream = iter.next();
            String sheetName = iter.getSheetName();
            System.out.println(sheetName + " [index=" + index + "]");
            if(sheetName.equals(sheetname_)){
            FileOutputStream fileOutputStream = new FileOutputStream(destFile);
            processSheet(styles, strings, new SheetToCSV(fileOutputStream), stream);
            fileOutputStream.flush();
            fileOutputStream.close();            
            }
            stream.close();

            ++index;
        }
        xlsxPackage.close();
    }

    public static void main(String[] args) throws Exception {
        ExcelXlsx2Csv.process("D:\\data\\latest.xlsx", "D:\\data\\latest.csv","sheet1"); //source , destination, sheetname
    }
}