Java實現HTML轉PDF,主要為了解決將ai返回的html文本數據轉為PDF文件方便用戶下載查看。
一、deepSeek-AI提問詞
基于以上個人數據。總結個人身體信息,分析個人身體指標信息。再按一個月為維度,詳細列舉一個月內訓練計劃,維度詳細至每周每天,要求:不可省略表格內容以精簡示例,文本結構順序為標題個人信息,第一步,第二步。最終回答結果以標準的html形式返回結果,不能帶有meta標簽,字體為STSong-Light,SimSun,html內容禁止使用單標簽。
二、表設計
-- Task table for DeepSeek requests: one row per prompt, tracking its status,
-- retry count, the raw answer and the URL of the generated result file.
CREATE TABLE `p_deep_seek_task` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '任務id',
  `user_id` bigint(20) NOT NULL COMMENT '用戶id',
  `status` char(2) NOT NULL DEFAULT '0' COMMENT '任務狀態(0:待處理,1:處理中,2:處理成功,3:異常處理失敗,4:重試中,5:重試失敗)',
  `try_time` int(11) NOT NULL DEFAULT '0' COMMENT '執行次數',
  `result_url` varchar(255) DEFAULT NULL COMMENT '結果文件url',
  `prompt` longtext NOT NULL COMMENT '提問內容',
  `content` longtext COMMENT '結果內容',
  `reasoning_content` longtext COMMENT '思考過程',
  `create_time` datetime DEFAULT NULL COMMENT '創建時間',
  `create_by` bigint(20) DEFAULT NULL COMMENT '創建人',
  `update_by` bigint(20) DEFAULT NULL COMMENT '更新人',
  `update_time` datetime DEFAULT NULL COMMENT '更新時間',
  `task_time` date DEFAULT NULL COMMENT '任務日期',
  `execute_time` datetime DEFAULT NULL COMMENT '執行時間',
  `exception_msg` longtext COMMENT '異常信息',
  -- The comma after this column was missing, which made the statement unparsable.
  `cost_time` bigint(20) NOT NULL DEFAULT '0' COMMENT '執行耗時(s)',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=16 DEFAULT CHARSET=utf8mb4 COMMENT='deepseek任務';
三、導入Jar包
<dependency><groupId>io.github.pig-mesh.ai</groupId><artifactId>deepseek-spring-boot-starter</artifactId></dependency><dependency><groupId>org.xhtmlrenderer</groupId><artifactId>flying-saucer-pdf</artifactId><version>9.1.22</version> </dependency><dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.14.3</version> </dependency>
四、工具類
ai請求工具
package com.company.project.service.client;import io.github.pigmesh.ai.deepseek.config.DeepSeekProperties;
import io.github.pigmesh.ai.deepseek.core.DeepSeekClient;
import io.github.pigmesh.ai.deepseek.core.chat.ChatCompletionRequest;
import io.github.pigmesh.ai.deepseek.core.chat.ChatCompletionResponse;
import org.springframework.stereotype.Service;import javax.annotation.Resource;/*** @author: reshui* description:DeepSeek服務* DateTime:2025/3/31-14:48*/
@Service
public class DeepSeekAiClient {@Resourceprivate DeepSeekClient deepSeekClient;@Resourceprivate DeepSeekProperties deepSeekProperties;/*** 提問接口* 獲取deepseek的響應結果* @param prompt 提示詞*/public ChatCompletionResponse syncChat(String prompt) {ChatCompletionRequest request = ChatCompletionRequest.builder()// 根據渠道模型名稱動態修改這個參數.model(deepSeekProperties.getModel()).addUserMessage(prompt).build();return deepSeekClient.chatCompletion(request).execute();}}
-
特定html字符內容過濾工具
package com.company.project.service.tools;import cn.hutool.core.collection.CollUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import java.util.regex.Matcher;
import java.util.regex.Pattern;/*** @author reshui* description* dateTime 2025/04/17*/
public class HtmlFormatter {private String htmlContent;private HtmlFormatter(String htmlContent) {this.htmlContent = htmlContent;}// 入口方法,創建處理器實例public static HtmlFormatter process(String htmlContent) {return new HtmlFormatter(htmlContent);}// 鏈式方法:前置過濾干擾字符public HtmlFormatter beforeFilter() {this.htmlContent = beforeFilterInterferenceCharacters(this.htmlContent);return this;}// 鏈式方法:替換內容標簽public HtmlFormatter replaceTags() {this.htmlContent = replaceContentTag(this.htmlContent);return this;}// 鏈式方法:后置過濾干擾字符public HtmlFormatter afterFilter() {this.htmlContent = afterFilterInterferenceCharacters(this.htmlContent);return this;}// 獲取最終結果public String get() {return this.htmlContent;}public static String formatHtml(String htmlContent) {// 過濾掉html中的干擾字符String filteredHtml = beforeFilterInterferenceCharacters(htmlContent);// 去除內容中的大于小于號干擾String replaceContentTag = replaceContentTag(filteredHtml);//過濾html中的干擾標簽return afterFilterInterferenceCharacters(replaceContentTag);}/*** 替換html中的干擾內容** @param html 文本*/public static String replaceContentTag(String html) {Document doc = Jsoup.parse(html);removeTag(doc);traverse(doc.body());doc.outputSettings().prettyPrint(false);return doc.html();}/*** 去除不支持的meta標簽* @param doc jsoupdoc*/public static void removeTag(Document doc) {Elements meta = doc.getElementsByTag("meta");for (Element metaElement : meta) {metaElement.remove();}}public static void traverse(Element element) {if (CollUtil.isEmpty(element.children())) {String text = element.text().replace("<", "小于").replace(">", "大于");element.text(text);}for (Element child : element.children()) { // 遍歷子元素traverse(child); // 遞歸調用以處理子元素及其子元素}}/*** 后置過濾掉html中的干擾字符** @param html 文本*/public static String afterFilterInterferenceCharacters(String html) {return html.replace("<br></br>", "<br/>").replace("<br>", "<br/>").replace("</br>", "<br/>");}/*** 前置過濾掉html中的干擾字符** @param html 文本*/public static String beforeFilterInterferenceCharacters(String html) {return html.replace("```html", 
"").replace("```", "").replace("<!DOCTYPE html>", "").replace("<!doctype html>", "");}/*** 將HTML字符串中的所有標簽轉為小寫** @param html 原始HTML字符串* @return 轉換后的HTML字符串*/public static String convertTagsToLowerCase(String html) {// 正則表達式匹配HTML標簽Pattern pattern = Pattern.compile("</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>");Matcher matcher = pattern.matcher(html);StringBuffer result = new StringBuffer();while (matcher.find()) {// 將匹配到的標簽轉為小寫String lowerCaseTag = matcher.group().toLowerCase();matcher.appendReplacement(result, lowerCaseTag);}matcher.appendTail(result);return result.toString();}}
-
html轉pdf工具
package com.company.project.service.tools;import cn.hutool.core.date.DateUtil;
import cn.hutool.core.io.FileUtil;
import com.company.project.common.utils.SpringUtils;
import com.company.project.service.properties.PdfFontProperties;
import lombok.extern.slf4j.Slf4j;
import org.xhtmlrenderer.pdf.ITextRenderer;import java.io.File;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Date;/*** @author: reshui* description: html轉pdf工具類* DateTime:2025/3/31-15:55*/
@Slf4j
public class HtmlToPdfTools {/*** 文件暫存地址*/private static final String TEMP_FILE_PATH = System.getProperty("java.io.tmpdir");/*** pdf文件暫存地址*/private static final String PDF_FILE_PATH = TEMP_FILE_PATH + File.separator + "ai_train_pdf";/*** 時間格式*/private static final String TIMESTAMP_FORMAT = "yyyyMMddHHmmss";/*** pdf配置文件*/public final static PdfFontProperties CONFIG = SpringUtils.getBean(PdfFontProperties.class);/*** html轉pdf文件** @param htmlContent html內容文本*/public static File convertHtmlToPdfFile(String htmlContent) throws Exception {String formatDateTimeStamp = DateUtil.format(new Date(), TIMESTAMP_FORMAT);String pdfFilePath = PDF_FILE_PATH + File.separator + formatDateTimeStamp + ".pdf";FileUtil.touch(pdfFilePath);String resultHtmlContent = HtmlFormatter.process(htmlContent).beforeFilter().replaceTags().afterFilter().get();generatePdfReport(pdfFilePath, resultHtmlContent);log.info("pdf文件儲存地址:{}", pdfFilePath);return new File(pdfFilePath);}/*** 生成pdf文件** @param outputPath 輸出文件地址* @param htmlContent html內容文本*/public static void generatePdfReport(String outputPath, String htmlContent) throws Exception {try (OutputStream outputStream = Files.newOutputStream(Paths.get(outputPath))) {ITextRenderer renderer = new ITextRenderer();renderer.getFontResolver().addFont(CONFIG.getPath(),CONFIG.getEncoding(),CONFIG.getEmbedded());
// renderer.getFontResolver().addFont(
// "c://Windows//Fonts//simsun.ttc",
// "Identity-H",
// true
// );renderer.setDocumentFromString(htmlContent);renderer.layout();renderer.createPDF(outputStream);}}public static void main(String[] args) throws Exception {String html = "";convertHtmlToPdfFile(html);}
}
配置文件yml
# deepseek配置文件
deepseek:
  base-url: https://api.deepseek.com/v1
  api-key: xxxxxxxxxxxxx
  model: deepseek-reasoner
  connectTimeout: 60
  readTimeout: 240
  callTimeout: 360

# windows-pdf字體配置
pdf:
  font:
    path: c://Windows//Fonts//simsun.ttc
    encoding: Identity-H
    embedded: true

# linux-pdf字體配置(與上方 windows 配置二選一,不可同時存在兩個 pdf 節點;path 為示例,請改為服務器上字體文件的實際路徑)
# pdf:
#   font:
#     path: /usr/share/fonts/simsun.ttc
#     encoding: Identity-H
#     embedded: true