一、 关于设计
(一)数据库
选择外键所引用的标识字段时,需先判断该字段的值今后是否可能被修改。例如菜单id、菜单code、菜单名三者中,前两者通常不会变动,都可做外键;菜单名可能被修改,不应做外键。
二、关于组件
(一)POI
1. 文档页数统计
import lombok. extern. slf4j. Slf4j ;
import org. apache. pdfbox. pdmodel. PDDocument ;
import org. ofdrw. reader. OFDReader ;
import org. springframework. web. multipart. MultipartFile ;
import java. io. File ;
import java. io. FileInputStream ;
import java. io. IOException ;
import java. io. InputStream ;
import java. nio. file. Path ;
import java. nio. file. Paths ;
@Slf4j
public class LvDocPageCounter {
public static final String DOCUMENT_PAGE_TEMP = "DOCUMENT_PAGE_TEMP" ;
public static int getPageCount ( String filePath) {
String fileType = getFileType ( filePath) ;
try {
switch ( fileType) {
case "pdf" :
return getPdfPageCount ( filePath) ;
case "docx" :
return getDocxPageCount ( filePath) ;
case "doc" :
return getDocPageCount ( filePath) ;
case "ofd" :
return getOfdPageCount ( filePath) ;
default :
log. warn ( "不支持的文件类型:{}" , filePath) ;
return 1 ;
}
} catch ( Exception e) {
log. warn ( "读取文件异常:{},{}" , filePath, e) ;
return 0 ;
}
}
private static String getFileType ( String filePath) {
int dotIndex = filePath. lastIndexOf ( '.' ) ;
if ( dotIndex == - 1 || dotIndex == filePath. length ( ) - 1 ) {
log. warn ( "文件名中没有找到扩展名:{}" , filePath) ;
return "" ;
}
return filePath. substring ( dotIndex + 1 ) . toLowerCase ( ) ;
}
private static int getPdfPageCount ( String filePath) throws IOException {
try ( PDDocument document = Loader . loadPDF ( new File ( filePath) ) ) {
int numberOfPages = document. getNumberOfPages ( ) ;
document. close ( ) ;
return numberOfPages;
}
}
private static int getDocPageCount ( String filePath) throws IOException {
try ( InputStream inputStream = new FileInputStream ( filePath) ) {
com. aspose. words. Document doc = new com. aspose. words. Document( inputStream) ;
int num = doc. getPageCount ( ) ;
doc. cleanup ( ) ;
return num;
} catch ( Exception e) {
e. printStackTrace ( ) ;
return 0 ;
}
}
private static int getDocxPageCount ( String filePath) throws IOException {
try ( InputStream inputStream = new FileInputStream ( filePath) ) {
com. aspose. words. Document doc = new com. aspose. words. Document( inputStream) ;
int num = doc. getPageCount ( ) ;
doc. cleanup ( ) ;
return num;
} catch ( Exception e) {
e. printStackTrace ( ) ;
return 0 ;
}
}
private static int getOfdPageCount ( String filePath) throws IOException {
Path ofdFile = Paths . get ( filePath) ;
OFDReader ofdReader = new OFDReader ( ofdFile) ;
int numberOfPages = ofdReader. getNumberOfPages ( ) ;
ofdReader. close ( ) ;
return numberOfPages;
}
public static Integer getPageCount ( MultipartFile inputStream, String originalFilename) {
try ( InputStream inputStream1 = inputStream. getInputStream ( ) ) {
return getPageCount ( inputStream1, originalFilename) ;
} catch ( IOException e) {
log. warn ( "读取文件异常:{},{}" , originalFilename, e) ;
return 0 ;
}
}
}
2. 文本提取
import cn. hutool. core. io. FileUtil ;
import lombok. extern. slf4j. Slf4j ;
import org. apache. commons. io. FilenameUtils ;
import org. apache. pdfbox. pdmodel. PDDocument ;
import org. apache. pdfbox. text. PDFTextStripper ;
import org. apache. poi. hwpf. HWPFDocument ;
import org. apache. poi. hwpf. extractor. WordExtractor ;
import org. apache. poi. xwpf. extractor. XWPFWordExtractor ;
import org. apache. poi. xwpf. usermodel. XWPFDocument ;
import org. ofdrw. converter. export. TextExporter ;
import java. io. File ;
import java. io. FileInputStream ;
import java. io. IOException ;
import java. io. InputStream ;
import java. nio. file. Path ;
import java. nio. file. Paths ;
import java. util. concurrent. atomic. AtomicInteger ;
@Slf4j
public class LvDocTxTHunter {
private static AtomicInteger UPPER_LIMIT = new AtomicInteger ( 50 ) ;
public static String readText ( String filePath) {
int pageCount = LvDocPageCounter . getPageCount ( filePath) ;
if ( pageCount > UPPER_LIMIT . get ( ) ) {
log. warn ( "文件过大:{},{}" , filePath, pageCount) ;
return "" ;
}
String fileType = getFileType ( filePath) ;
try {
switch ( fileType) {
case "pdf" :
return readPdfText ( filePath) ;
case "doc" :
return readDocText ( filePath) ;
case "docx" :
return readDocxText ( filePath) ;
case "ofd" :
return readOfdText ( filePath) ;
default :
log. warn ( "不支持的文件类型:{}" , filePath) ;
return "" ;
}
} catch ( IOException e) {
log. warn ( "读取文件异常:{},{}" , filePath, e) ;
return "" ;
}
}
private static String getFileType ( String filePath) {
int dotIndex = filePath. lastIndexOf ( '.' ) ;
if ( dotIndex == - 1 || dotIndex == filePath. length ( ) - 1 ) {
log. warn ( "文件名中没有找到扩展名:{}" , filePath) ;
return "" ;
}
return filePath. substring ( dotIndex + 1 ) . toLowerCase ( ) ;
}
private static String readPdfText ( String filePath) throws IOException {
try ( PDDocument document = Loader . loadPDF ( filePath) ) {
String text = new PDFTextStripper ( ) . getText ( document) ;
document. close ( ) ;
return text;
}
}
private static String readDocText ( String filePath) throws IOException {
try ( InputStream inputStream = new FileInputStream ( filePath) ;
HWPFDocument document = new HWPFDocument ( inputStream) ) {
WordExtractor extractor = new WordExtractor ( document) ;
String text = extractor. getText ( ) ;
document. close ( ) ;
return text;
}
}
private static String readDocxText ( String filePath) throws IOException {
try ( InputStream inputStream = new FileInputStream ( filePath) ;
XWPFDocument document = new XWPFDocument ( inputStream) ) {
XWPFWordExtractor extractor = new XWPFWordExtractor ( document) ;
String text = extractor. getText ( ) ;
document. close ( ) ;
return text;
}
}
private static String readOfdText ( String filePath) throws IOException {
Path txtPath = Paths . get ( "DOCUMENT_PAGE_TEMP" , FilenameUtils . getBaseName ( filePath) + ".txt" ) ;
TextExporter textExporter = new TextExporter ( Paths . get ( filePath) , txtPath) ;
textExporter. export ( ) ;
String s = FileUtil . readUtf8String ( txtPath. toFile ( ) ) ;
textExporter. close ( ) ;
return s;
}
public static String readText ( File tempFile) {
return readText ( tempFile. getPath ( ) ) ;
}
}
3. 文档转换
/**
 * Scans the {@code font} directory below {@code FileUtil.local} with the preloaded
 * {@code FontLoader} and prints the names of the fonts it now knows.
 *
 * <p>The names are read reflectively from the loader's {@code fontNamePathMapping}
 * field (presumably because the loader offers no accessor for it — TODO confirm).
 */
@SuppressWarnings("unchecked")
private static void systemInit() {
    final FontLoader fontLoader = FontLoader.Preload();
    fontLoader.scanFontDir(Paths.get(FileUtil.local, "font"));

    // Look the field up first, then read its value from the loader instance.
    final Field mappingField = ReflectUtil.getField(FontLoader.class, "fontNamePathMapping");
    final Object rawMapping = ReflectUtil.getFieldValue(fontLoader, mappingField);
    final Map<String, String> nameToPath = (Map<String, String>) rawMapping;

    final String fontNames = JSONUtil.toJsonStr(nameToPath.keySet());
    System.out.println("加载字体:" + fontNames);
}
/**
 * Converts the OFD document at {@code ofdPath} into a PDF written to {@code pdfPath}.
 *
 * @param ofdPath  path of the source OFD file
 * @param distPath not used by the conversion; kept so existing callers still compile
 * @param pdfPath  path of the PDF file to produce
 * @throws IOException if the exporter cannot be created or closed
 */
public static void convertOfdToPDFByBridge(String ofdPath, String distPath, String pdfPath) throws IOException {
    log.debug("解析文件:{}", ofdPath);
    Path ofdFilePath = Paths.get(ofdPath);
    // try-with-resources releases the exporter even when export() fails.
    try (PDFExporterIText exporter = new PDFExporterIText(ofdFilePath, Paths.get(pdfPath))) {
        exporter.export();
    }
}