[ https://issues.apache.org/jira/browse/IO-200?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Pascal Schumacher closed IO-200. -------------------------------- Resolution: Won't Fix I'm closing this because https://commons.apache.org/proper/commons-csv/ already exists. > CSV component > ------------- > > Key: IO-200 > URL: https://issues.apache.org/jira/browse/IO-200 > Project: Commons IO > Issue Type: New Feature > Components: Utilities > Reporter: haruhiko nishi > Priority: Trivial > > TableBuilder is a 'Builder' that maps the CSV to a matrix and provides an > interface that allows the user to manipulate it after it is built by parsing a csv > file with its parse() method. (There is only one method implemented and it is for > copying a column's values to another position, as I could not think of other > operations that may be useful.) > Within the TableBuilder, each column of the CSV is represented as byte[] and > each becomes a target to be validated against a Rule, represented by the > interface that you find in the example code below. As TableBuilder > builds the table ("buildTable") when the parse() method is invoked, a byte[] representation of the > value of each CSV cell is passed to the isValid() method of the implementations of > Rule, which you apply to the TableBuilder instance through the addRule() > method. (You can add as many Rules as you need.) > Rules get executed until the validation fails or succeeds. If any of the Rules > fails, then its replace() is called and the column value being processed gets > replaced by the return value of this method. > Another goodie is that it is possible to refer to the values of the preceding > cells of the row within a Rule. > It is useful if you need to see the entries of the preceding cells when > validating the value in a Rule. An example would be, > Given a csv, > A,B,C > 1,2,3 > in order for the value 3 of the column C to be validated true, the value > of A needs to be less than the value of C. > TableBuilder is RFC 4180 compliant and therefore distinguishes an NL that exists by > itself from an NL found in double quotes. 
> So you can add a Rule that practically removes all NL chars found in a value > enclosed within double quotes. > (useful when you need to remove CRLF in double quotes from CSV exported from > Excel) > Currently, TableBuilder implements a method called copyColumn with the method > signature > copyColumn(Rule rule,int from,int to, boolean override), which allows the user to > manipulate the parsed csv. > What it does is literally copy the column that is specified at 'from' > to the 'to' position of the matrix. > If override is true, the column at the 'to' position is overwritten; otherwise the columns from that position on are shifted right > and the copy is inserted at the specified position. > You can specify some kind of Rule here to rewrite the value being copied from > the origin. > An example would be to copy column values that all end with .jpg or .gif to > the position specified, prefixing each column value with > "http://some.server.com/imanges." after checking that the image exists, after > checking that the named file exists at some location, also by an > implementation of another Rule. > TableBuilder is just a "rough sketch" idea of CSV parsing. (The code below > works fine though.) It still needs a lot of refactoring and so on. > I appreciate any comment on this idea. What do you think? My code style sucks, > I know! > Here is a simple example of using TableBuilder. > {code:title=TableBuilder|borderStyle=solid} > public static void main(String[] args)throws Exception{ > TableBuilder tableBuilder=new TableBuilder("UTF-8", > new MessageHandler(){ > public void handleMessage(String message) { > System.err.println(message); > } > },0,true); > tableBuilder.addRule(3,new RemoveNLChars()); //removing NL cahracters > found in value. > tableBuilder.parse(new FileInputStream("test.txt"),TableBuilder.CSV); > List<Record> list=tableBuilder.getRowAsListOf(Record.class); > for(Record record:list) > System.out.println(record.getA());//TODO not implemented yet! 
> tableBuilder.writeTo(new > FileOutputStream("test_mod.txt"),TableBuilder.CSV); > } > public class RemoveNLChars extends StringValueRuleAdapter { > protected boolean isValid(String columnValue) { > return !columnValue.contains(System.getProperty("line.separator")); > } > protected String replace(String columnValue) { > return > columnValue.replaceAll(System.getProperty("line.separator"),""); > } > public String getMessage() { > return ""; > } > } > public interface Rule { > public void setRowReference(List<byte[]> rowReference); > public void setCharsetName(String charsetName); > boolean isValid(final byte[] columnValue); > byte[] replace(final byte[] columnValue); > String getMessage(); > } > //StringValueruleAdapter is an adapter converts the byte[] representation of > the cell value. > public abstract class StringValueRuleAdapter implements Rule{ > private String charsetName; > private List<byte[]> rowReference; > > public void setRowReference(List<byte[]> rowReference) { > this.rowReference=rowReference; > } > public void setCharsetName(String charsetName) { > this.charsetName=charsetName; > } > public final boolean isValid(final byte[] columnValue) { > String strValue; > try { > if(columnValue.length>0) > strValue=(charsetName!=null) ? new > String(columnValue,charsetName) : new String(columnValue); > else > strValue=""; > } catch (UnsupportedEncodingException e) { > if(columnValue.length>0) > strValue=new String(columnValue); > else > strValue=""; > } > return isValid(strValue); > } > public final byte[] replace(final byte[] columnValue) { > String strValue; > try { > if(columnValue.length>0) > strValue=(charsetName!=null) ? new > String(columnValue,charsetName):new String(columnValue); > else > strValue=""; > return (charsetName!=null) ? 
> replace(strValue).getBytes(charsetName):replace(strValue).getBytes(); > } catch (UnsupportedEncodingException e) { > if(columnValue.length>0) > strValue=new String(columnValue); > else > strValue=""; > return replace(strValue).getBytes(); > } > } > protected String getRowValue(int column) { > try { > return (charsetName!=null) ? new > String(rowReference.get(column),charsetName) : > new String(rowReference.get(column)); > } catch (UnsupportedEncodingException e) { > return new String(rowReference.get(column)); > } catch(IndexOutOfBoundsException noListFound){ > throw new IllegalArgumentException("no value exists at the > requested column."); > } > } > protected String getPrecedingRowValue(){ > return getRowValue(rowReference.size()-1); > } > protected abstract boolean isValid(String columnValue); > protected abstract String replace(String columnValue); > } > public class TableBuilder { > public static int CSV=0x2c; > public static int TSV=0x09; > private Map<Integer,Set<Rule>> columnRule=new > HashMap<Integer,Set<Rule>>(); > private Table currentTable; > private byte[] newLineChars; > private boolean endsWithNL; > private String charsetName; > private int rowOffset; > private boolean useFirstColumnAsRowName; > private MessageHandler msgHandler=new MessageHandler(){ > public void handleMessage(String message) { > System.err.println(message); > } > }; > public TableBuilder(String charsetName,MessageHandler msgHandler,int > rowOffset,boolean useFirstColumnAsRowName){ > this.charsetName=charsetName; > this.rowOffset=rowOffset; > this.msgHandler=msgHandler; > this.useFirstColumnAsRowName=useFirstColumnAsRowName; > } > public TableBuilder(String charsetName){ > this.charsetName=charsetName; > } > public TableBuilder(){ > > } > public void addRule(int column, Rule rule){ > Set<Rule> ruleset; > if((ruleset=columnRule.get(column))==null){ > ruleset=new LinkedHashSet<Rule>(); > columnRule.put(column,ruleset); > } > rule.setCharsetName(charsetName); > ruleset.add(rule); > } 
> public void parse(InputStream in, int delimiter)throws Exception{ > int bytesRead; > byte buf[]=new byte[1024]; > ByteArrayOutputStream outbuf=new ByteArrayOutputStream(buf.length); > while((bytesRead=in.read(buf,0,buf.length))!=-1) > outbuf.write(buf,0,bytesRead); > in.close(); > ByteBuffer > bytebuffer=ByteBuffer.allocateDirect(outbuf.size()).put(outbuf.toByteArray()); > bytebuffer.flip(); > currentTable=buildTable(bytebuffer,delimiter); > } > private class Table { > private List<byte[]>[] columnMatrix; > private List<List<byte[]>> rowMatrix; > > Table(List<byte[]>[] columnMatrix,List<List<byte[]>> rowMatrix){ > this.columnMatrix=columnMatrix; > this.rowMatrix=rowMatrix; > } > public int getNumOfColumns() { > return columnMatrix.length; > } > public int getNumOfRows(){ > return rowMatrix.size(); > } > public byte[] getValueAt(int row, int column) { > return columnMatrix[column].get(row); > } > public byte[] getColumnName(int column){ > return columnMatrix[column].get(0); > } > public List<byte[]> getColumn(int column){ > return columnMatrix[column]; > } > public List<byte[]> getRow(int row){ > return rowMatrix.get(row); > } > > } > //TODO extract csv row as JavaBean > public <E> List<E> getRowAsListOf(final Class<E> clazz){ > List<E> list=null; > Iterator<byte[]> header=currentTable.getRow(0).iterator(); > for(int i=1;i<currentTable.getNumOfRows();i++){ > try { > E instance=clazz.newInstance(); > for(byte[] value:currentTable.getRow(i)){ > String name=new String(header.next()); > //BeanUtils.setProperty(instance,name,value); > } > if(list==null) > list=new ArrayList<E>(); > list.add(instance); > header=currentTable.getRow(0).iterator(); > } catch (IllegalAccessException e) { > e.printStackTrace(); > } catch (InvocationTargetException e) { > e.printStackTrace(); > } catch (InstantiationException e) { > e.printStackTrace(); > } > } > return list; > } > public void writeTo(OutputStream out,int delimiter) throws IOException { > for(int 
i=0,j=0;i<currentTable.getNumOfRows();i++,j=0){ > for(byte[] value:currentTable.getRow(i)){ > out.write(value); > if(++j<currentTable.getNumOfColumns()) > out.write(delimiter); > } > if(i<currentTable.getNumOfRows()-1) > out.write(newLineChars); > else{ > if(endsWithNL) > out.write(newLineChars); > } > } > out.close(); > } > > public void copyColumn(Rule rule,int from,int to, boolean override) { > int numOfColumns=override ? > currentTable.getNumOfColumns():currentTable.getNumOfColumns()+1; > List<byte[]>[] columnMatrix=(List<byte[]>[])new > List[numOfColumns]; > columnMatrix[to]=new ArrayList<byte[]>(); > for(int i=0,j=0;i<columnMatrix.length;i++){ > if(i==to){ > for(int row=0;row<currentTable.getNumOfRows();row++){ > byte[] value; > if(row>=rowOffset) > value=currentTable.getValueAt(row,from); > else > value=new byte[0]; > if(rule!=null && row>rowOffset){ > rule.setCharsetName(charsetName); > rule.setRowReference(currentTable.getRow(row)); > if(!rule.isValid(value)){ > String columnName; > byte[] > columnNameByte=currentTable.getColumnName(from); > if(columnNameByte.length>0){ > try { > if(charsetName!=null) > columnName="'"+new > String(columnNameByte,charsetName).trim()+"'"; > else > columnName="'"+new > String(columnNameByte).trim()+"'"; > } catch (UnsupportedEncodingException e) { > columnName="'"+new > String(columnNameByte).trim()+"'"; > } > }else > columnName="''"; > value=rule.replace(value); > String msg=rule.getMessage(); > if(msg.length()>0) > try { > handleMessage(msg > .replace("${column_from}",""+from) > > .replace("${columnName}",columnName) > .replace("${column_to}",""+(to+1)) > > .replace("${row}",useFirstColumnAsRowName ? 
new > String(currentTable.getRow(row).get(0),charsetName) : ""+(row+1))); > } catch (UnsupportedEncodingException > ignored) { > > } > } > } > columnMatrix[i].add(value); > if(override) > currentTable.rowMatrix.get(row).remove(i); > currentTable.rowMatrix.get(row).add(i,value); > } > if(override) > ++j; > }else > columnMatrix[i]=currentTable.getColumn(j++); > } > currentTable=new Table(columnMatrix,currentTable.rowMatrix); > } > > private Table buildTable(ByteBuffer buf,int delimiter) throws > ParseException { > List<byte[]>[] columnMatrix=null; > List<List<byte[]>> rowMatrix=new ArrayList<List<byte[]>>(); > int i=0,j,currentRow=0,rowIndex=0,column_count=0,column=0; > endsWithNL=true; > newLineChars=null; > int limit=buf.limit(); > int pos=0; > while(i<limit && > ((j=(buf.get(i)&0xff))==0x0d||(j=(buf.get(i)&0xff))==0x0a)){ > if(j==0x0a) > ++currentRow; > pos=++i; > } > > int headRow=currentRow; > while(i<limit){ > int tmp=buf.get(i) & 0xff; > if(tmp==0x0a){ > int k=i; > while(k>=0 > &&((buf.get(k)&0xff)==0x0d||(buf.get(k)&0xff)==0x0a)) > --k; > byte[] prev=new byte[++k-pos]; > > buf.position(pos); > buf.get(prev,0,prev.length); > List<byte[]> row; > try{ > row=rowMatrix.get(rowIndex); > }catch(IndexOutOfBoundsException noListFound){ > rowMatrix.add(new ArrayList<byte[]>()); > row=rowMatrix.get(rowIndex); > } > if(currentRow==headRow){ > column_count=column; > row.add(prev); > columnMatrix=(List<byte[]>[])new ArrayList[column+1]; > Iterator<byte[]> itr; > for(j=0,itr=row.iterator();j<columnMatrix.length;j++){ > columnMatrix[j]=new ArrayList<byte[]>(); > columnMatrix[j].add(itr.next()); > } > }else if(column_count!=column){ > throw new ParseException("column count mismatch on > row ",currentRow+1); > }else{ > Set<Rule> ruleset=columnRule.get(column); > if(ruleset!=null && currentRow>rowOffset+headRow){ > byte[] > columnNameByte=rowMatrix.get(rowOffset).get(column); > Rule rule=validate(ruleset,prev,row); > if(rule!=null){ > String columnName; > 
if(columnNameByte.length>0){ > try { > if(charsetName!=null) > columnName="'"+new > String(columnNameByte,charsetName).trim()+"'"; > else > columnName="'"+new > String(columnNameByte).trim()+"'"; > } catch (UnsupportedEncodingException e) { > columnName="'"+new > String(columnNameByte).trim()+"'"; > } > }else > columnName="''"; > prev=rule.replace(prev); > String msg=rule.getMessage(); > if(msg.length()>0) > try { > handleMessage(msg > .replace("${column}",""+column) > > .replace("${columnName}",columnName.trim()) > > .replace("${row}",useFirstColumnAsRowName ? new > String(rowMatrix.get(rowIndex).get(0),charsetName) : ""+(currentRow+1))); > } catch (UnsupportedEncodingException > ignored) { > } > } > } > columnMatrix[column].add(prev); > row.add(prev); > } > if(newLineChars==null){ > newLineChars=new byte[++i-k]; > buf.position(k); > buf.get(newLineChars,0,newLineChars.length); > }else > ++i; > while(i<limit && > ((j=(buf.get(i)&0xff))==0x0d||(j=(buf.get(i)&0xff))==0x0a)){ > if(j==0x0a) > ++currentRow; > ++i; > } > column=0; > ++currentRow; > ++rowIndex; > pos=i; > }else if(tmp==delimiter){ > List<byte[]> row; > try{ > row=rowMatrix.get(rowIndex); > }catch(IndexOutOfBoundsException noListFound){ > rowMatrix.add(new ArrayList<byte[]>()); > row=rowMatrix.get(rowIndex); > } > byte[] prev=new byte[i-pos]; > buf.position(pos); > buf.get(prev,0,prev.length); > if(currentRow==headRow) > row.add(prev); > else{ > Set<Rule> ruleset=columnRule.get(column); > if(ruleset!=null && currentRow>rowOffset+headRow){ > byte[] > columnNameByte=rowMatrix.get(rowOffset).get(column); > Rule rule=validate(ruleset,prev,row); > if(rule!=null){ > String columnName; > if(columnNameByte.length>0){ > try { > if(charsetName!=null) > columnName="'"+new > String(columnNameByte,charsetName).trim()+"'"; > else > columnName="'"+new > String(columnNameByte).trim()+"'"; > } catch (UnsupportedEncodingException e) { > columnName="'"+new > String(columnNameByte).trim()+"'"; > } > }else > columnName="''"; 
> prev=rule.replace(prev); > String msg=rule.getMessage(); > if(msg.length()>0) > try { > handleMessage(msg > .replace("${column}",""+column) > > .replace("${columnName}",columnName.trim()) > > .replace("${row}",useFirstColumnAsRowName ? new > String(rowMatrix.get(rowIndex).get(0),charsetName) : ""+(currentRow+1))); > } catch (UnsupportedEncodingException ignored) { > } > } > } > columnMatrix[column].add(prev); > row.add(prev); > } > ++column; > pos=++i; > }else > if((i=_ESCAPED(buf,i))==i) > ++i; > } > if(pos!=limit){ > endsWithNL=false; > byte[] remaining=new byte[limit-pos]; > buf.position(pos); > buf.get(remaining,0,remaining.length); > > if(columnMatrix!=null){ > if(column_count!=column) > throw new ParseException("column count mismatch on row > ",+1+currentRow); > List<byte[]> row=rowMatrix.get(rowIndex); > row.add(remaining); > Set<Rule> ruleset=columnRule.get(column); > if(ruleset!=null && currentRow>rowOffset+headRow){ > byte[] > columnNameByte=rowMatrix.get(rowOffset).get(column); > Rule rule=validate(ruleset,remaining,row); > if(rule!=null){ > String columnName; > if(columnNameByte.length>0){ > try { > if(charsetName!=null) > columnName="'"+new > String(columnNameByte,charsetName).trim()+"'"; > else > columnName="'"+new > String(columnNameByte).trim()+"'"; > } catch (UnsupportedEncodingException e) { > columnName="'"+new > String(columnNameByte).trim()+"'"; > } > }else > columnName="''"; > remaining=rule.replace(remaining); > String msg=rule.getMessage(); > if(msg.length()>0) > try { > handleMessage(msg > .replace("${column}",""+column) > .replace("${columnName}",columnName.trim()) > .replace("${row}",useFirstColumnAsRowName ? 
> new String(rowMatrix.get(rowIndex).get(0),charsetName) : ""+(currentRow+1))); > } catch (UnsupportedEncodingException ignored) { > } > } > } > columnMatrix[column].add(remaining); > }else{ > columnMatrix=(List<byte[]>[])new List[column+1]; > List<byte[]> row; > try{ > row=rowMatrix.get(rowIndex); > }catch(IndexOutOfBoundsException noListFound){ > rowMatrix.add(new ArrayList<byte[]>()); > row=rowMatrix.get(rowIndex); > } > row.add(remaining); > Iterator<byte[]> itr; > for(j=0,itr=row.iterator();j<columnMatrix.length;j++){ > columnMatrix[j]=new ArrayList<byte[]>(1); > columnMatrix[j].add(itr.next()); > } > } > } > return new Table(columnMatrix,rowMatrix); > } > private int _ESCAPED(ByteBuffer src,int i){ > int org=i; > if(i==src.limit()) > return i; > int j; > if((j=_DQUOTE(src,i))==i) > return i; > > for(i=j;(j=_TEXTDATA(src,i))>i||(j=_COMMA(src,i))>i||(j=_CR(src,i))>i||(j=_LF(src,i))>i||(j=_2DQUOTE(src,i))>i;) > i=j; > if(i==_DQUOTE(src,i)) > return org; > return i; > } > private int _TEXTDATA(ByteBuffer src,int i){ > if(i==src.limit()) > return i; > if(_COMMA(src,i)==i && _CR(src,i)==i && _LF(src,i)==i && > _DQUOTE(src,i)==i) > return ++i; > return i; > } > private int _2DQUOTE(ByteBuffer src,int i) { > if(i==src.limit()) > return i; > if(i==_DQUOTE(src,i)) > return i; > if(i+1==_DQUOTE(src,i+1)) > return i; > return i+2; > } > private int _DQUOTE(ByteBuffer src,int i) { > return _CHAR(src,i,0x22); > } > public int _LF(ByteBuffer src,int i) { > return _CHAR(src,i,0x0a); > } > private int _CR(ByteBuffer src,int i) { > return _CHAR(src,i,0x0d); > } > private int _COMMA(ByteBuffer src,int i) { > return _CHAR(src,i,0x2c); > } > private int _CHAR(ByteBuffer src,int i,int token){ > if(i==src.limit()) > return i; > if((src.get(i) & 0xff)==token) > ++i; > return i; > } > > private void handleMessage(String message) { > msgHandler.handleMessage(message); > } > > public Rule validate(Set<Rule> ruleset,byte[] value, List<byte[]> > rowReference) { > for(Rule rule:ruleset){ 
> if(rule!=null){ > rule.setRowReference(rowReference); > if(!rule.isValid(value)) > return rule; > } > } > return null; > } > } > {code} -- This message was sent by Atlassian JIRA (v6.3.15#6346)