gpt4 book ai didi

java - 如何使用 Java 从 PDMetadata(xmpmeta) 中搜索 <pdf:Keywords>

转载 作者:行者123 更新时间:2023-11-30 06:45:13 26 4
gpt4 key购买 nike

Using the code below, I get the PDF's XMP metadata as shown in the output that follows. From this output, how can I get the value of each property (e.g. "pdf:Keywords")?

// Opens the PDF at pdfFile, parses it, and prints the raw XMP metadata
// packet (if the document catalog has one) to standard output.
InputStream in = new FileInputStream(pdfFile);
try
{
    PDFParser parser = new PDFParser(in);
    parser.parse();
    PDMetadata metadata = parser.getPDDocument().getDocumentCatalog().getMetadata();
    // getMetadata() is null-checked because a catalog may carry no metadata stream.
    if (metadata != null)
    {
        System.out.println(metadata.getInputStreamAsString());
    }
    // NOTE(review): the document returned by getPDDocument() is not closed
    // here either; presumably it should be closed once it is no longer
    // needed — confirm against the PDFBox version in use.
}
finally
{
    // Release the file handle even when parsing or printing throws.
    in.close();
}

输出类似

    metadata<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.2-c001 63.139439, 2010/09/27-13:37:26 ">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
<rdf:Description rdf:about=""
xmlns:pdf="http://ns.adobe.com/pdf/1.3/">
<pdf:Producer>Acrobat Distiller 7.0 (Windows)</pdf:Producer>
<pdf:Keywords>F3392|4|Name 2016|02|2016|04|Sub111 |Three Hours|30|(5X1=5),(5X2=10), (3X5=15)&#xD;&#xA;</pdf:Keywords>
</rdf:Description>
<rdf:Description rdf:about=""
xmlns:pdfx="http://ns.adobe.com/pdfx/1.3/">
<pdfx:semester>02</pdfx:semester>
</rdf:Description>
</rdf:RDF>
</x:xmpmeta>
<?xpacket end="w"?>

最佳答案

您需要 Apache PDFBox 及其已弃用的 JempBox 库(它实现了 Adobe 的 XMP(TM) 规范;PDFBox 现在改用 XmpBox,但 XmpBox 更加严格,无法处理大多数 PDF 中的元数据格式)

PDDocument document = PDDocument.load(inputStream);
PDDocumentCatalog cat = document.getDocumentCatalog();
PDMetadata metadata = cat.getMetadata();

if( metadata != null )
{
try
{
XMPMetadata meta = XMPMetadata.load(metadata.exportXMPMetadata());

XMPSchema xs = meta.getDublinCoreSchema();
if (xs != null)
{
System.out.println("About: " + xs.getAbout());
List<String> bagList = xs.getBagList("author");
if(bagList != null)
{
System.out.println("BagList: ");
ShowString(bagList);
}
List<String> creators = xs.getBagList("creator");
if(creators != null)
{
System.out.println("Creators: ");
ShowString(creators);
}
}

XMPSchemaDublinCore dc = meta.getDublinCoreSchema();
if (dc != null)
{
System.out.println("Title: " + dc.getTitle());
System.out.println("Description:" + dc.getDescription());
System.out.println("Source:" + dc.getSource());
System.out.println("Identifier:" + dc.getIdentifier());
System.out.println("Coverage:" + dc.getCoverage());
System.out.println("About:" + dc.getAbout());
List<String> relationships = dc.getRelationships();
if(relationships != null)
{
System.out.println("Relationships: ");
ShowString(relationships);
}
List<String> contributors = dc.getContributors();
if(contributors != null)
{
System.out.println("Contributors: ");
ShowString(contributors);
}
List<String> creators = dc.getCreators();
if(creators != null)
{
System.out.println("Creators: ");
ShowString(creators);
}
List<Calendar> dates = dc.getDates();
if(dates != null)
{
System.out.println("Dates: ");
ShowCalendar(dates);
}
List<String> subjects = dc.getSubjects();
if(subjects != null)
{
System.out.println("Subjects: ");
ShowString(subjects);
}
List<String> publishers = dc.getPublishers();
if(publishers != null)
{
System.out.println("Publishers: ");
ShowString(publishers);
}
List<String> languages = dc.getLanguages();
if(languages != null)
{
System.out.println("Languages: ");
ShowString(languages);
}
}

XMPSchemaPDF pdf = meta.getPDFSchema();
if (pdf != null)
{
System.out.println("Keywords:" + pdf.getKeywords());
System.out.println("PDF Version:" + pdf.getPDFVersion());
System.out.println("PDF Producer:" + pdf.getProducer());
}

XMPSchemaBasic basic = meta.getBasicSchema();
if (basic != null)
{
System.out.println("Create Date:" + basic.getCreateDate().getTime());
System.out.println("Modify Date:" + basic.getModifyDate().getTime());
System.out.println("Creator Tool:" + basic.getCreatorTool());
System.out.println("Label:" + basic.getLabel());
System.out.println("About:" + basic.getAbout());
System.out.println("Nickname:" + basic.getNickname());
System.out.println("Title:" + basic.getTitle());
}
}
catch (Exception e)
{
System.err.println("An error occurred while parsing the meta data: "
+ e.getMessage());
}
}
else
{
PDDocumentInformation info = document.getDocumentInformation();
System.out.println( "Title:" + info.getTitle() );
System.out.println( "Author:" + info.getAuthor() );
System.out.println( "Subject:" + info.getSubject() );
System.out.println( "Keywords:" + info.getKeywords() );
System.out.println( "Creator:" + info.getCreator() );
System.out.println( "Producer:" + info.getProducer() );
System.out.println( "Creation Date:" + info.getCreationDate().getTime() );
System.out.println( "Modification Date:" + info.getModificationDate().getTime() );
System.out.println( "Trapped:" + info.getTrapped() );
}

document.close();

Direct download of PDFBox jar

PDFBox Maven link

Direct download of JempBox jar

JempBox Maven link

关于 java - 如何使用 Java 从 PDMetadata(xmpmeta) 中搜索 <pdf:Keywords>,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/43806174/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com