C#使用iTextSharp将PDF转成文本的方法
本文实例讲述了C#使用iTextSharp将PDF转成文本的方法。分享给大家供大家参考。具体实现方法如下:
using System;
using System.Diagnostics;
using System.IO;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
/// <summary>
/// Command-line example that dumps the string tokens found in the page
/// content streams of a PDF file into a text file, using iTextSharp's
/// <c>PRTokeniser</c>.
/// </summary>
public class ParsingPDF
{
    /// <summary>
    /// Parses the PDF using <c>PRTokeniser</c> and writes every token of type
    /// <c>PRTokeniser.TokType.STRING</c> to the destination file, one per line.
    /// </summary>
    /// <remarks>
    /// The token's <c>StringValue</c> is written as returned by the tokenizer;
    /// this method applies no further processing to it.
    /// </remarks>
    /// <param name="src">The path to the original PDF file.</param>
    /// <param name="dest">The path to the resulting text file (created or overwritten).</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="src"/> or <paramref name="dest"/> is <c>null</c>.
    /// </exception>
    public void parsePdf(String src, String dest)
    {
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }
        if (dest == null)
        {
            throw new ArgumentNullException("dest");
        }

        // The reader is opened before the output file is created, so a PDF that
        // cannot be opened does not leave an empty destination file behind.
        PdfReader reader = new PdfReader(src);
        try
        {
            // Disposing the writer flushes it and closes the underlying FileStream,
            // even when tokenizing a page throws.
            using (StreamWriter output = new StreamWriter(new FileStream(dest, FileMode.Create)))
            {
                int pageCount = reader.NumberOfPages;

                // PDF page numbers are 1-based.
                for (int pg = 1; pg <= pageCount; pg++)
                {
                    // Inspect the raw syntax of the page's content stream.
                    byte[] streamBytes = reader.GetPageContent(pg);
                    PRTokeniser tokenizer = new PRTokeniser(streamBytes);
                    while (tokenizer.NextToken())
                    {
                        if (tokenizer.TokenType == PRTokeniser.TokType.STRING)
                        {
                            output.WriteLine(tokenizer.StringValue);
                        }
                    }
                }
            }
        }
        finally
        {
            // Release the reader's hold on the source document on every path.
            reader.Close();
        }
    }

    /// <summary>
    /// Main method. Expects the input PDF path and, optionally, the output text
    /// path. When the output path is omitted, it defaults to the input file's
    /// name with a ".txt" extension (no directory part, so it is resolved
    /// relative to the current directory).
    /// </summary>
    /// <param name="args">Command-line arguments: <c>infile.pdf [outfile.txt]</c>.</param>
    static void Main(string[] args)
    {
        if (args.Length < 1 || args.Length > 2)
        {
            Console.WriteLine("USAGE: ParsePDF infile.pdf <outfile.txt>");
            return;
        }

        string pdfPath = args[0];
        string textPath = args.Length == 2
            ? args[1]
            : Path.GetFileNameWithoutExtension(pdfPath) + ".txt";

        try
        {
            // Stopwatch is monotonic, unlike subtracting two DateTime.Now samples.
            Stopwatch stopwatch = Stopwatch.StartNew();
            ParsingPDF example = new ParsingPDF();
            example.parsePdf(pdfPath, textPath);
            stopwatch.Stop();
            Console.WriteLine("Parsing completed in {0:0.00} seconds.", stopwatch.Elapsed.TotalSeconds);
        }
        catch (Exception ex)
        {
            // Top-level handler of the console tool: report the failure instead of crashing.
            Console.WriteLine("ERROR: " + ex.Message);
        }
    }

    /// <summary>
    /// Render listener that writes text render events to a <see cref="StreamWriter"/>,
    /// wrapping each text block and each rendered text chunk in angle brackets.
    /// It is not referenced by <c>parsePdf</c> in this class.
    /// </summary>
    public class MyTextRenderListener : IRenderListener
    {
        /// <summary>The writer to which the information will be written.</summary>
        protected StreamWriter output;

        /// <summary>
        /// Creates a render listener that will look for text.
        /// </summary>
        /// <param name="output">The writer that receives the extracted text.</param>
        /// <exception cref="ArgumentNullException"><paramref name="output"/> is <c>null</c>.</exception>
        public MyTextRenderListener(StreamWriter output)
        {
            if (output == null)
            {
                throw new ArgumentNullException("output");
            }
            this.output = output;
        }

        /// <summary>Writes the opening bracket of a text block.</summary>
        public void BeginTextBlock()
        {
            output.Write("<");
        }

        /// <summary>Writes the closing bracket of a text block and ends the line.</summary>
        public void EndTextBlock()
        {
            output.WriteLine(">");
        }

        /// <summary>Images are ignored; this listener only reports text.</summary>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
        }

        /// <summary>Writes the text of the render event, wrapped in angle brackets.</summary>
        public void RenderText(TextRenderInfo renderInfo)
        {
            output.Write("<");
            output.Write(renderInfo.GetText());
            output.Write(">");
        }
    } // class MyTextRenderListener
} // class ParsingPDF
希望本文所述对大家的C#程序设计有所帮助。