好吧,之前用的是舊版的,現在出了個新版的,優先選擇用新版的哈。
從官網下載相應的開發包,然後把以下幾個東西添加到項目工程裡面:1. Data 文件夾;2. NLPIR_JNI.dll;3. NLPIR.jar;4. nlpir.properties。
添加完這些東西後,主要需要配置的是 nlpir.properties 文件,內容大致如下:
dll_or_so_path=D\:\\Spiliter\\NLPIR_JNI.dll
data_dir_parent_path=.
其中主要要配置的是 dll_or_so_path,它的值是項目裡面 NLPIR_JNI.dll 的絕對路徑。注意:properties 文件裡的反斜杠要寫成 \\(如上面的示例所示)。
搞定後就可以開始寫代碼了:
package shell;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Properties;
import kevin.zhang.NLPIR;
/**
 * Thin wrapper around the NLPIR JNI binding ({@code kevin.zhang.NLPIR}) that
 * segments text line by line.
 *
 * <p>Files are read and written as UTF-8, while the byte arrays exchanged with
 * the native library are GBK. Characters that GBK cannot represent will
 * therefore not survive the round trip through {@link #getSplitString}.
 */
public class Spliter {
    /** Charset used to read the input file and to write the segmented output. */
    private static final String FILE_ENCODING = "utf8";

    /** Charset of the byte arrays passed to and returned by the native library. */
    private static final String NATIVE_ENCODING = "gbk";

    NLPIR nlpir = null;

    /**
     * Creates the native binding and initialises it.
     *
     * <p>{@code NLPIR_Init} is called with {@code "././"}, {@code 0} and
     * {@code "0"}. NOTE(review): these are presumably the parent directory of
     * the Data folder, the encoding code and the licence code; confirm against
     * the NLPIR documentation.
     *
     * <p>An initialisation failure is only reported on standard output, so the
     * instance is still returned to the caller in that case.
     */
    public Spliter() {
        this.nlpir = new NLPIR();
        String argu = "././";
        try {
            if (!this.nlpir.NLPIR_Init(argu.getBytes("GB2312"), 0, "0".getBytes("GB2312"))) {
                System.out.println("init failed");
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }

    /**
     * Segments every line of a UTF-8 text file and writes the result to a
     * sibling file whose name carries a {@code _new} suffix before the
     * extension (for example {@code w.txt} becomes {@code w_new.txt}).
     *
     * <p>Each input line is echoed to standard output, segmented with
     * {@link #getSplitString}, and written followed by {@code "\r\n"}. A line
     * whose segmentation yields {@code null} is written as an empty line.
     * I/O errors are reported via {@code printStackTrace()} and not rethrown.
     *
     * @param file path of the input file
     * @param tag  passed through unchanged to {@link #getSplitString}
     */
    public void SliptFile(String file, int tag) {
        String outputPath = buildOutputPath(file);
        try {
            // Open the input first so that a missing input file does not leave
            // an empty output file behind.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), FILE_ENCODING));
            try {
                Writer writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(new File(outputPath)), FILE_ENCODING));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        System.out.println(line);
                        String newLine = getSplitString(line, tag);
                        // Never emit the literal text "null" into the output.
                        writer.write(newLine == null ? "" : newLine);
                        writer.write("\r\n");
                    }
                } finally {
                    // Closing also flushes the buffered output.
                    writer.close();
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            // Covers FileNotFoundException and UnsupportedEncodingException too.
            e.printStackTrace();
        }
    }

    /**
     * Derives the output path by inserting {@code _new} before the extension of
     * the last path segment. Only a dot that follows the last {@code /} or
     * {@code \} counts as the extension dot, so dots in directory names are
     * left alone. When there is no extension, {@code _new} is appended, which
     * guarantees the output path differs from the input path.
     */
    private static String buildOutputPath(String file) {
        int lastSeparator = Math.max(file.lastIndexOf('/'), file.lastIndexOf('\\'));
        int lastDot = file.lastIndexOf('.');
        if (lastDot > lastSeparator) {
            return file.substring(0, lastDot) + "_new" + file.substring(lastDot);
        }
        return file + "_new";
    }

    /**
     * Segments one piece of text with {@code NLPIR_ParagraphProcess}.
     *
     * <p>The text is encoded as GBK for the native call and the returned bytes
     * are decoded as GBK. The result is also printed to standard output.
     *
     * @param text text to segment; {@code null} or empty text yields an empty
     *             string without calling the native library
     * @param tag  passed through unchanged to {@code NLPIR_ParagraphProcess}
     *             (NOTE(review): presumably toggles part-of-speech tagging;
     *             confirm against the NLPIR documentation)
     * @return the segmented text, an empty string when the native call returns
     *         nothing, or {@code null} when the GBK charset is unavailable
     */
    public String getSplitString(String text, int tag) {
        if (text == null || text.length() == 0) {
            return "";
        }
        String splitStr = null;
        try {
            byte[] nativeBytes = nlpir.NLPIR_ParagraphProcess(text.getBytes(NATIVE_ENCODING), tag);
            if (nativeBytes == null) {
                return "";
            }
            // The segmenter is written in C++, so the result ends with a '\0'
            // that must be dropped before decoding. Strip it only when it is
            // actually there, so an empty result cannot produce a negative length.
            int length = nativeBytes.length;
            if (length > 0 && nativeBytes[length - 1] == 0) {
                length--;
            }
            splitStr = new String(nativeBytes, 0, length, NATIVE_ENCODING);
            System.out.println("分詞結果為: " + splitStr);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return splitStr;
    }

    /** Demo entry point: segments {@code d:\w.txt} with tag 0. */
    public static void main(String[] args) {
        Spliter s = new Spliter();
        s.SliptFile("d:\\w.txt", 0);
    }
}