Java讀取html文件,并獲取body中所有的標簽及內容的案例
這里獲取的是html文件中body中的所有標簽以及內容
package com.lmt.service.file;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import org.springframework.stereotype.Component;
import com.lmt.config.UrlConstants;
Component
public class ParseFile {
/**
* 解析html文件
* param file
* return
*/
public String readHtml(File file){
String body = "";
try {
FileInputStream iStream = new FileInputStream(file);
Reader reader = new InputStreamReader(iStream);
BufferedReader htmlReader = new BufferedReader(reader);
String line;
boolean found = false;
while (!found && (line = htmlReader.readLine()) != null) {
if (line.toLowerCase().indexOf("
的前面可能存在空格found = true;
}
}
found = false;
while (!found && (line = htmlReader.readLine()) != null) {
if (line.toLowerCase().indexOf("
found = true;
} else {
// 果存在圖片,則將相對路徑轉換為絕對路徑
String lowerCaseLine = line.toLowerCase();
if (lowerCaseLine.contains("src")) {
//這里是定義圖片的訪問路徑
String directory = "D:/test";
// 果路徑名不以反斜杠結尾,則手動添加反斜杠
/*if (!directory.endsWith("\\")) {
directory = directory + "\\";
}*/
// line = line.substring(0, lowerCaseLine.indexOf("src") + 5) + directory +
line.substring(lowerCaseLine.indexOf("src") + 5);
/*String filename = extractFilename(line);
line = line.substri