package openminer.data;

import java.io.*;
import java.util.*;
import openminer.association.data.*;

/**
 * Parses an ARFF file into an {@code Instances} data set.
 *
 * <p>Usage as shown by this class: call {@link #loadFile(String)}, then
 * {@link #buildArrayInstances()}, then {@link #close()}.
 */
public class ArffFileLoader {
    /** Raw stream over the ARFF file; assigned by {@link #loadFile(String)}. */
    private FileInputStream m_FileInputStream = null;
    /** Buffered wrapper around the file stream; the parser reads from it and uses mark/reset on it. */
    private BufferedInputStream m_BufferedInputStream = null;

    /** Attribute list */
    private ArrayList m_AttList = null;
    /** Count of the data instances scanned so far */
    private int m_CurInstIndex = 0;
    /** Current token string */
    private String m_TokenString;
    /** Line number currently being scanned */
    private int m_CurScanLineNo = 1;
    /** Character currently being scanned */
    private int m_NextScanChar;

    /** Token type constants */
    private final static int TOKEN_FILEEND = 0;
    private final static int TOKEN_SAT = 1;
    private final static int TOKEN_STR = 2;
    private final static int TOKEN_STR_VAL = 3;
    private final static int TOKEN_NUM = 4;
    private final static int TOKEN_FLOAT = 5;
    private final static int TOKEN_BIG_BRACKET_LEFT = 20;
    private final static int TOKEN_BIG_BRACKET_RIGHT= 21;
    private final static int TOKEN_COMMA = 22;
    private final static int TOKEN_ERROR = -1;

    public ArffFileLoader() {
    }

    /**
     * Opens an ARFF file for parsing.
     *
     * <p>Streams left open by an earlier call are closed first so they are not leaked.
     *
     * @param fileName name of the ARFF file
     * @throws Exception if the file cannot be opened or a previously opened stream cannot be closed
     */
    public void loadFile(String fileName) throws Exception {
        close();
        // Assign the FIELD: declaring a local of the same name here would shadow it,
        // leave the field null and make close() fail with a NullPointerException.
        m_FileInputStream = new FileInputStream(fileName);
        m_BufferedInputStream = new BufferedInputStream(m_FileInputStream);
    }

    /**
     * Closes the internal data streams. Does nothing if no file is open.
     *
     * @throws Exception if closing a stream fails
     */
    public void close() throws Exception {
        BufferedInputStream buffered = m_BufferedInputStream;
        FileInputStream file = m_FileInputStream;
        m_BufferedInputStream = null;
        m_FileInputStream = null;
        try {
            if (buffered != null) {
                buffered.close();
            }
        } finally {
            // Still release the file handle when closing the wrapper failed.
            if (file != null) {
                file.close();
            }
        }
    }

    /**
     * Builds an in-memory {@code Instances} data set from the opened file.
     *
     * <p>Reads, in order: the relation name, every attribute definition, the
     * '@data' label and then every data instance.
     *
     * @return the data set that was built
     * @throws Exception if the file content cannot be parsed
     */
    public Instances buildArrayInstances() throws Exception {
        MemoryInstances insts = new MemoryInstances();
        m_AttList = new ArrayList();
        Attribute att;
        Instance inst;

        // Start parsing
        beginParse();

        // Read the relation name
        insts.setName(loadRelationName());

        // Read the attribute definitions; loadAttribute() returns null when there are no more
        while ((att = loadAttribute()) != null) {
            insts.addAttribute(att);
            m_AttList.add(att);
        }

        // Read the '@data' label
        loadDataLabel();

        // Read the data instances; loadInstance() returns null when there are no more
        while ((inst = loadInstance()) != null) {
            insts.add(inst);
            m_CurInstIndex++;
        }

        return insts;
    }

    /**
     * Reads the name of the relation.
     *
     * @return the name of the relation
     * @throws Exception if the '@relation' declaration is missing or its name is not a string
     */
    private String
loadRelationName() throws Exception { int token1 = nextToken(); int token2 = nextToken(); String token2Str = m_TokenString; int token3 = nextToken(); if(token1 != TOKEN_SAT || token2 != TOKEN_STR || !token2Str.equals("relation")) { throw new Exception("Not found '@relation' " + " in line "+m_CurScanLineNo); } if(token3 != TOKEN_STR && token3 != TOKEN_STR_VAL) { throw new Exception("'@relation' name must be a string " + " in line "+m_CurScanLineNo); } return m_TokenString; } /** * 读取数据关系表中的一个属性 * @return 读取的属性 * @throws Exception */ private Attribute loadAttribute() throws Exception { Attribute att = new Attribute(); m_BufferedInputStream.mark(64); int token = nextToken(); // 检测 '@attribute' if(token != TOKEN_SAT) { m_BufferedInputStream.reset(); throw new Exception("Unexpect attribute name format! " + " in line "+m_CurScanLineNo); } token = nextToken(); if(token != TOKEN_STR || !m_TokenString.equals("attribute")) { m_BufferedInputStream.reset(); if(m_TokenString.equals("data")) return null; else throw new Exception("Unknown label : "+m_TokenString + " in line "+m_CurScanLineNo); } // 获取属性的名字 token = nextToken(); if(token != TOKEN_STR) { throw new Exception("Unexpect format! 
" + " in line "+m_CurScanLineNo); } att.setName(m_TokenString); // 获取属性的类型 token = nextToken(); switch(token) { case TOKEN_STR: // 普通类型 if(m_TokenString.equals("real") || m_TokenString.equals("numberic")) { // 实数类型 att.setType(Attribute.NUMBERIC_TYPE); } else if(m_TokenString.equals("bool")) { // 布尔类型 att.setType(Attribute.BOOLEAN_TYPE); } else if(m_TokenString.equals("string")) { // 字符串类型 att.setType(Attribute.STRING_TYPE); } else { // 未知类型 throw new Exception("Unknown attribute type "+m_TokenString + " in line "+m_CurScanLineNo); } break; case TOKEN_BIG_BRACKET_LEFT: { // 集合类型 ArrayList valueSet = new ArrayList(); // 获取每个值域内的值 while((token = nextToken()) == TOKEN_STR) { valueSet.add(m_TokenString); token = nextToken(); if (token != TOKEN_COMMA) { if (token == TOKEN_BIG_BRACKET_RIGHT) { break; } else { throw new Exception("Unexpected token " + m_TokenString + " in line "+m_CurScanLineNo); } } } if(token != TOKEN_BIG_BRACKET_RIGHT) { throw new Exception("Unexpected token "+m_TokenString + " in line "+m_CurScanLineNo); } att.setType(Attribute.STRING_TYPE); att.setValueField(valueSet); break; } default: // 错误类型 throw new Exception("Unknown attribute type "+m_TokenString+ " in line "+m_CurScanLineNo); } return att; } /** * 装载‘@data'的标记 * @throws Exception */ private void loadDataLabel() throws Exception{ int token = nextToken(); if(token != TOKEN_SAT) { throw new Exception("No '@data' label ."); } token = nextToken(); if(token != TOKEN_STR || !m_TokenString.equals("data")) { throw new Exception("No '@data' label ."); } } /** * 装载一个数据实例 * @return 数据实例 * @throws Exception */ private Instance loadInstance() throws Exception{ int token; int i = 0; Instance inst = new Instance(m_AttList.size()); Attribute att; if(m_AttList.size() == 0) return null; for(i = 0; i