基本信息
源码名称:PDF转文本(VC6.0代码)
源码大小:1.71M
文件格式:.rar
开发语言:C/C++
更新时间:2021-01-27
   友情提示:(无需注册或充值,赞助后即可获取资源下载链接)

     嘿,亲!知识可是无价之宝呢,但咱这精心整理的资料也耗费了不少心血呀。小小地破费一下,绝对物超所值哦!如有下载和支付问题,请联系我们QQ(微信同号):813200300

本次赞助数额为: 2 元 
   源码介绍

一个识别PDF文件并提取文字内容的动态库,附带一个简单的调用例子。

示例是用VC6.0写的,版本比较老


部分代码:

// Extracts the text of a PDF file into a text file. The work is driven through
// the command-line option parser with a fixed, synthetic argument vector:
//   <pdf-path> -layout -enc GBK
//
// szPDFFilePath:  full path of the input PDF file. Must not be NULL.
// szTextFilePath: full path of the output text file. When NULL, the output
//                 name is derived from the PDF path: a trailing ".pdf"/".PDF"
//                 is dropped and ".txt" (or ".html" when htmlMeta is set) is
//                 appended. The name "-" selects stdout for the HTML
//                 header/footer.
// Returns:
//   0  - success
//   1  - the PDF could not be read
//   2  - the text file could not be opened/written
//   3  - the PDF does not permit copying text
//   4  - argument parsing failed (or version/help output was requested)
//   5  - the text output encoding could not be obtained
//   99 - other error (including a NULL szPDFFilePath)
PDFTOTEXT_API int WINAPI fnPDFToText(char* szPDFFilePath,char* szTextFilePath)
{
    // A NULL input path would be handed straight to parseArgs()/GString below.
    if (!szPDFFilePath) {
        return 99;
    }

    // Synthetic command line. argv[0] is left NULL; the option strings live in
    // writable local arrays because argv is declared as char*[].
    int argc=5;
    char* argv[5];
    argv[0]=NULL;
    argv[1]=szPDFFilePath;
    char pLayout[]="-layout";
    argv[2]=pLayout;
    char pEnc[]="-enc";
    argv[3]=pEnc;
    char pGBK[]="GBK";
    argv[4]=pGBK;

    // All locals are declared up front, before the first goto, so that no
    // jump to the cleanup labels crosses an initialization.
    PDFDoc *doc;
    GString *fileName;
    GString *textFileName;
    GString *ownerPW, *userPW;
    TextOutputDev *textOut;
    FILE *f;
    UnicodeMap *uMap;
    Object info;
    GBool ok;
    char *p;
    int exitCode;
    int pageFirst, pageLast;

    exitCode = 99;

    // parse args
    // After the options are consumed, argc is expected to be 2 or 3
    // (program name, PDF file, optional text file).
    ok = parseArgs(argDesc, &argc, argv);
    if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
        fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
        fprintf(stderr, "%s\n", xpdfCopyright);
        if (!printVersion) {
            printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
        }
        exitCode=4;
        goto err0;
    }
    fileName = new GString(argv[1]);
    if (fixedPitch) {
        physLayout = gTrue;
    }

    // read config file
    globalParams = new GlobalParams(cfgFileName);
    if (textEncName[0]) {
        globalParams->setTextEncoding(textEncName);
    }
    if (textEOL[0]) {
        if (!globalParams->setTextEOL(textEOL)) {
            fprintf(stderr, "Bad '-eol' value on command line\n");
        }
    }
    if (noPageBreaks) {
        globalParams->setTextPageBreaks(gFalse);
    }
    if (quiet) {
        globalParams->setErrQuiet(quiet);
    }

    // get mapping to output encoding
    if (!(uMap = globalParams->getTextEncoding())) {
        error(errConfig, -1, "Couldn't get text encoding");
        // fileName has not been handed to a PDFDoc yet, so release it here.
        delete fileName;
        exitCode=5;
        goto err1;
    }

    // open PDF file
    // A first byte of '\001' marks a password as "not supplied".
    if (ownerPassword[0] != '\001') {
        ownerPW = new GString(ownerPassword);
    } else {
        ownerPW = NULL;
    }
    if (userPassword[0] != '\001') {
        userPW = new GString(userPassword);
    } else {
        userPW = NULL;
    }
    // NOTE(review): fileName is never deleted by this function from here on;
    // presumably PDFDoc takes ownership of it - confirm against PDFDoc.
    doc = new PDFDoc(fileName, ownerPW, userPW);
    if (userPW) {
        delete userPW;
    }
    if (ownerPW) {
        delete ownerPW;
    }
    if (!doc->isOk()) {
        exitCode = 1;
        goto err2;
    }

    // check for copy permission
    if (!doc->okToCopy()) {
        error(errNotAllowed, -1,
              "Copying of text from this document is not allowed.");
        exitCode = 3;
        goto err2;
    }

    // construct text file name
    //  if (argc == 3) {
    //    textFileName = new GString(argv[2]);
    if(szTextFilePath)
    {
        textFileName = new GString(szTextFilePath);
    }
    else {
        // Only look at the last four characters when the name is long enough
        // to have them; otherwise p would point before the string buffer.
        int nameLen = fileName->getLength();
        p = (nameLen >= 4) ? fileName->getCString() + nameLen - 4 : NULL;
        if (p && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
            textFileName = new GString(fileName->getCString(), nameLen - 4);
        } else {
            textFileName = fileName->copy();
        }
        textFileName->append(htmlMeta ? ".html" : ".txt");
    }

    // get page range
    // Clamp into per-call locals rather than writing back to firstPage /
    // lastPage: a clamped lastPage left in the shared variable would cap the
    // page range of a later call on a longer document.
    pageFirst = firstPage;
    if (pageFirst < 1) {
        pageFirst = 1;
    }
    pageLast = lastPage;
    if (pageLast < 1 || pageLast > doc->getNumPages()) {
        pageLast = doc->getNumPages();
    }

    // write HTML header
    if (htmlMeta) {
        if (!textFileName->cmp("-")) {
            f = stdout;
        } else {
            if (!(f = fopen(textFileName->getCString(), "wb"))) {
                error(errIO, -1, "Couldn't open text file '{0:t}'", textFileName);
                exitCode = 2;
                goto err3;
            }
        }
        fputs("<html>\n", f);
        fputs("<head>\n", f);
        doc->getDocInfo(&info);
        if (info.isDict()) {
            printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
                            uMap);
            printInfoString(f, info.getDict(), "Subject",
                            "<meta name=\"Subject\" content=\"", "\">\n", uMap);
            printInfoString(f, info.getDict(), "Keywords",
                            "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
            printInfoString(f, info.getDict(), "Author",
                            "<meta name=\"Author\" content=\"", "\">\n", uMap);
            printInfoString(f, info.getDict(), "Creator",
                            "<meta name=\"Creator\" content=\"", "\">\n", uMap);
            printInfoString(f, info.getDict(), "Producer",
                            "<meta name=\"Producer\" content=\"", "\">\n", uMap);
            printInfoDate(f, info.getDict(), "CreationDate",
                          "<meta name=\"CreationDate\" content=\"%s\">\n");
            printInfoDate(f, info.getDict(), "LastModifiedDate",
                          "<meta name=\"ModDate\" content=\"%s\">\n");
        }
        info.free();
        fputs("</head>\n", f);
        fputs("<body>\n", f);
        fputs("<pre>\n", f);
        // The header file is closed before TextOutputDev is given the same
        // file name below.
        if (f != stdout) {
            fclose(f);
        }
    }

    // write text file
    //physLayout = gFalse;//wxg: whether to keep the original layout
    textOut = new TextOutputDev(textFileName->getCString(),
                                physLayout, fixedPitch, rawOrder, htmlMeta);
    if (textOut->isOk()) {
        doc->displayPages(textOut, pageFirst, pageLast, 72, 72, 0,
                          gFalse, gTrue, gFalse);
    } else {
        delete textOut;
        exitCode = 2;
        goto err3;
    }
    delete textOut;

    // write end of HTML file
    if (htmlMeta) {
        if (!textFileName->cmp("-")) {
            f = stdout;
        } else {
            // Reopen in append mode so the footer follows the extracted text.
            if (!(f = fopen(textFileName->getCString(), "ab"))) {
                error(errIO, -1, "Couldn't open text file '{0:t}'", textFileName);
                exitCode = 2;
                goto err3;
            }
        }
        fputs("</pre>\n", f);
        fputs("</body>\n", f);
        fputs("</html>\n", f);
        if (f != stdout) {
            fclose(f);
        }
    }

    exitCode = 0;

    // clean up
    // The labels fall through, so each entry point releases everything that
    // was acquired before the corresponding failure.
err3:
    delete textFileName;
err2:
    delete doc;
    uMap->decRefCnt();
err1:
    delete globalParams;
err0:

    // check for memory leaks
    Object::memCheck(stderr);
    gMemReport(stderr);

    return exitCode;
}