Pdf javascript to open file

Read PDF file using Javascript

In previous posts we explained how to read Excel files and CSV files using JavaScript; in this post I provide a working example that reads the content of a PDF file in JavaScript. For this example, we will use PDF.js to extract the PDF content.

Read PDF text using JavaScript

  • First, we convert the PDF file contents into an ArrayBuffer
  • The ArrayBuffer is passed to PDF.js, which loads the document using getDocument()
  • Each page's data is retrieved using getPage()
  • Each page's text is extracted using textContent.items

Let’s begin by adding the required JavaScript file and creating the HTML needed to browse for a PDF file

Once the file is browsed and selected we are calling JS function ExtractText()

Here is the complete JavaScript code that will be used

 var datass = ''; var DataArr = []; PDFJS.workerSrc = ''; function ExtractText() < var input = document.getElementById("file-id"); var fReader = new FileReader(); fReader.readAsDataURL(input.files[0]); // console.log(input.files[0]); fReader.onloadend = function (event) < convertDataURIToBinary(event.target.result); >> var BASE64_MARKER = ';base64,'; function convertDataURIToBinary(dataURI) < var base64Index = dataURI.indexOf(BASE64_MARKER) + BASE64_MARKER.length; var base64 = dataURI.substring(base64Index); var raw = window.atob(base64); var rawLength = raw.length; var array = new Uint8Array(new ArrayBuffer(rawLength)); for (var i = 0; i < rawLength; i++) < array[i] = raw.charCodeAt(i); >pdfAsArray(array) > function getPageText(pageNum, PDFDocumentInstance) < // Return a Promise that is solved once the text of the page is retrieven return new Promise(function (resolve, reject) < PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) < // The main trick to obtain the text of the PDF page, use the getTextContent method pdfPage.getTextContent().then(function (textContent) < var textItems = textContent.items; var finalString = ""; // Concatenate the string of the item to the final string for (var i = 0; i < textItems.length; i++) < var item = textItems[i]; finalString += item.str + " "; >// Solve promise with the text retrieven from the page resolve(finalString); >); >); >); > function pdfAsArray(pdfAsArray) < PDFJS.getDocument(pdfAsArray).then(function (pdf) < var pdfDocument = pdf; // Create an array that will contain our promises var pagesPromises = []; for (var i = 0; i < pdf.pdfInfo.numPages; i++) < // Required to prevent that i is always the total of pages (function (pageNumber) < // Store the promise of getPageText that returns the text of a page pagesPromises.push(getPageText(pageNumber, pdfDocument)); >)(i + 1); > // Execute all the promises Promise.all(pagesPromises).then(function (pagesText) < // Display text of all the pages in the console // e.g 
["Text content page 1", "Text content page 2", "Text content page 3" . ] console.log(pagesText); // representing every single page of PDF Document by array indexing console.log(pagesText.length); var outputStr = ""; for (var pageNum = 0; pageNum < pagesText.length; pageNum++) < console.log(pagesText[pageNum]); outputStr = ""; outputStr = "

Page " + (pageNum + 1) + " contents

"; var div = document.getElementById('output'); div.innerHTML += (outputStr + pagesText[pageNum]); > >); >, function (reason) < // PDF loading error console.error(reason); >); >

This is the sample PDF we will use to test this example; it has 2 pages, as shown in the image below

I have explained many parts of the code using comments.

Complete HTML/Javascript will look like this

       var datass = ''; var DataArr = []; PDFJS.workerSrc = ''; function ExtractText() < var input = document.getElementById("file-id"); var fReader = new FileReader(); fReader.readAsDataURL(input.files[0]); // console.log(input.files[0]); fReader.onloadend = function (event) < convertDataURIToBinary(event.target.result); >> var BASE64_MARKER = ';base64,'; function convertDataURIToBinary(dataURI) < var base64Index = dataURI.indexOf(BASE64_MARKER) + BASE64_MARKER.length; var base64 = dataURI.substring(base64Index); var raw = window.atob(base64); var rawLength = raw.length; var array = new Uint8Array(new ArrayBuffer(rawLength)); for (var i = 0; i < rawLength; i++) < array[i] = raw.charCodeAt(i); >pdfAsArray(array) > function getPageText(pageNum, PDFDocumentInstance) < // Return a Promise that is solved once the text of the page is retrieven return new Promise(function (resolve, reject) < PDFDocumentInstance.getPage(pageNum).then(function (pdfPage) < // The main trick to obtain the text of the PDF page, use the getTextContent method pdfPage.getTextContent().then(function (textContent) < var textItems = textContent.items; var finalString = ""; // Concatenate the string of the item to the final string for (var i = 0; i < textItems.length; i++) < var item = textItems[i]; finalString += item.str + " "; >// Solve promise with the text retrieven from the page resolve(finalString); >); >); >); > function pdfAsArray(pdfAsArray) < PDFJS.getDocument(pdfAsArray).then(function (pdf) < var pdfDocument = pdf; // Create an array that will contain our promises var pagesPromises = []; for (var i = 0; i < pdf.pdfInfo.numPages; i++) < // Required to prevent that i is always the total of pages (function (pageNumber) < // Store the promise of getPageText that returns the text of a page pagesPromises.push(getPageText(pageNumber, pdfDocument)); >)(i + 1); > // Execute all the promises Promise.all(pagesPromises).then(function (pagesText) < // Display text of all the pages in the console // 
e.g ["Text content page 1", "Text content page 2", "Text content page 3" . ] console.log(pagesText); // representing every single page of PDF Document by array indexing console.log(pagesText.length); var outputStr = ""; for (var pageNum = 0; pageNum < pagesText.length; pageNum++) < console.log(pagesText[pageNum]); outputStr = ""; outputStr = "

Page " + (pageNum + 1) + " contents

"; var div = document.getElementById('output'); div.innerHTML += (outputStr + pagesText[pageNum]); > >); >, function (reason) < // PDF loading error console.error(reason); >); >

Once we are done, we can use the above code in our browser, and you will see output as below

Complete Fiddle sample

Читайте также:  Где у питона сердце

As you can see from above example output, we were able to extract PDF contents using Javascript and show all the text.

Источник

Saved searches

Use saved searches to filter your results more quickly

You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session.

License

mozilla/pdf.js

This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?

Sign In Required

Please sign in to use Codespaces.

Launching GitHub Desktop

If nothing happens, download GitHub Desktop and try again.

Launching GitHub Desktop

If nothing happens, download GitHub Desktop and try again.

Launching Xcode

If nothing happens, download Xcode and try again.

Launching Visual Studio Code

Your codespace will open once ready.

There was a problem preparing your codespace, please try again.

Latest commit

[Annotation] Use the clip-path property when an annotation has some quad points

Git stats

Files

Failed to load latest commit information.

README.md

PDF.js is a Portable Document Format (PDF) viewer that is built with HTML5.

PDF.js is community-driven and supported by Mozilla. Our goal is to create a general-purpose, web standards-based platform for parsing and rendering PDFs.

PDF.js is an open source project and always looking for more contributors. To get involved, visit:

Feel free to stop by our Matrix room for questions or guidance.

Читайте также:  Illegalstateexception example in java

Please note that the «Modern browsers» version assumes native support for the latest JavaScript features; please also see this wiki page.

PDF.js is built into version 19+ of Firefox.

  • The official extension for Chrome can be installed from the Chrome Web Store. This extension is maintained by @Rob--W.
  • Build Your Own — Get the code as explained below and issue gulp chromium . Then open Chrome, go to Tools > Extension and load the (unpackaged) extension from the directory build/chromium .

To get a local copy of the current code, clone it using git:

$ git clone https://github.com/mozilla/pdf.js.git $ cd pdf.js 

Next, install Node.js via the official package or via nvm. You need to install the gulp package globally (see also gulp’s getting started):

If everything worked out, install all dependencies for PDF.js:

Finally, you need to start a local web server as some browsers do not allow opening PDF files using a file:// URL. Run:

Please keep in mind that this assumes the latest version of Mozilla Firefox; refer to Building PDF.js for non-development usage of the PDF.js library.

It is also possible to view all test PDF files on the right side by opening:

In order to bundle all src/ files into two production scripts and build the generic viewer, run:

If you need to support older browsers, run:

This will generate pdf.js and pdf.worker.js in the build/generic/build/ directory (respectively build/generic-legacy/build/ ). Both scripts are needed but only pdf.js needs to be included since pdf.worker.js will be loaded by pdf.js . The PDF.js files are large and should be minified for production.

Using PDF.js in a web application

To use PDF.js in a web application you can choose to use a pre-built version of the library or to build it from source. We supply pre-built versions for usage with NPM and Bower under the pdfjs-dist name. For more information and examples please refer to the wiki page on this subject.

PDF.js is hosted on several free CDNs:

You can play with the PDF.js API directly from your browser using the live demos below:

More examples can be found in the examples folder. Some of them are using the pdfjs-dist package, which can be built and installed in this repo directory via gulp dist-install command.

For an introduction to the PDF.js code, check out the presentation by our contributor Julian Viereck:

More learning resources can be found at:

The API documentation can be found at:

Check out our FAQs and get answers to common questions:

Источник

How to create a PDF Viewer in JavaScript

Many candidates are rejected or down-leveled in technical interviews due to poor performance in behavioral or cultural fit interviews. Ace your interviews with this free course, where you will practice confidently tackling behavioral interview questions.

PDF.js is a JavaScript library maintained by Mozilla and designed for handling PDFs in JavaScript.

We are going to create a PDF viewer that has the following functionalities:

  • View a PDF
  • Go to the next page
  • Go to the previous page
  • Go to a particular page number

Steps

Create an index.html file that includes:

  • canvas → Where the pdf will be rendered.
  • previous button → To go to the previous page.
  • next button → To go to the next page.
  • input box → To enter a page number.
  • Go to page button → Button to go to a particular page.
  • 2-span elements → Display the current page number and total pages of the PDF.
Читайте также:  Напечатать код функции python

Initializing the JavaScript file for rendering the PDF

In addition to the index.html file, we will create a script.js file where we can write our JavaScript code to create a PDF viewer.

Initialize the variables

let pdf ; // to store pdf data let canvas; // to render pdf let isPageRendering; // to check if the pdf is currently rendering let pageRenderingQueue = null; // to store next page number to render let canvasContext; // context of canvas let totalPages; // total pages of pdf let currentPageNum = 1; 

Next, add event listeners to handle the PDF renderer once the page loads:

window.addEventListener('load', function () < isPageRendering= false; pageRenderingQueue = null; canvas = document.getElementById('pdf_canvas'); canvasContext = canvas.getContext('2d'); initEvents(); Add events initPDFRenderer(); // render first page >); 

Implement initPDFRenderer function

  • We need to initialize the PDF.js with a source PDF
  • We can use the getDocument method to get a promise that resolves to pdfData
  • The PDF data has a getPage function
  • The getPage will return a promise
  • Once the promise is resolved , we get the page data
  • We can then use the render method in the page data to render it in the canvas
function initPDFRenderer() < let url = 'https://raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/web/compressed.tracemonkey-pldi-09.pdf'; // const url = 'filepath.pdf'; // to load pdf from our machine let option = < url>; pdfjsLib.getDocument(option) .promise .then( pdfData => < totalPages = pdfData.numPages; // total number of pages let pagesCounter= document.getElementById('total_page_num'); // update total pages text pagesCounter.textContent = totalPages; // assigning read pdfContent to global variable pdf = pdfData; console.log(pdfData); renderPage(currentPageNum); >); > 

Now, when we call initPDFRenderer, it will assign pdfData to the pdf variable.

Add events for pagination buttons

Add events for previousButton , nextButton , and goToPage buttons.

Implement renderPage function

Now, let’s create a renderPage function to render the PDF page to the canvas.

function renderPage(pageNumToRender = 1) < isPageRendering = true; document.getElementById('current_page_num').textContent = pageNumToRender; // use getPage method pdf .getPage(pageNumToRender) .then( page =>< const viewport = page.getViewport(); canvas.height = viewport.height; canvas.width = viewport.width; let renderCtx = ; page .render(renderCtx) .promise .then(()=> < isPageRendering = false; // this is to check if there is next page to be rendered in the queue if(pageRenderingQueue !== null) < renderPage(pageRenderingQueue); pageRenderingQueue = null; >>); >); > 

We have a method to get pdfData and render the page. Let’s write our pageRenderingQueue .

If the user clicks next page/previous page, it will add/subtract 1 to the currentPageNum and pass it to the renderPageQueue method. This will check if the pageRenderingQueue is null. If it is null, then we call the renderPage method, or else it will assign the page number that is to be rendered to the queue . Once the page rendering is complete, it will check if the pageQueue is empty and perform the respective action (if needed).

function renderPageQueue(pageNum) < if(pageRenderingQueue != null) < pageRenderingQueue = pageNum; >else < renderPage(pageNum); >> 

Let’s create a renderNextPage and renderPreviousPage method. If the user clicks:

  • next page – currentPageNum + 1 and render page.
  • previous page – currentPageNum - 1 and render page.
function renderNextPage(ev) < if(currentPageNum >= totalPages) < alert("This is the last page"); return ; >currentPageNum++; renderPageQueue(currentPageNum); > function renderPreviousPage(ev) < if(currentPageNum<=1) < alert("This is the first page"); return ; >currentPageNum--; renderPageQueue(currentPageNum); > 

Now, let’s implement the “go to page number” function.

Get the page number from the input box, then check if the number is valid and call the renderPage method.

function goToPageNum(ev) < let numberInput = document.getElementById('page_num'); let pageNumber = parseInt(numberInput.value); if(pageNumber) < if(pageNumber = 1) < currentPageNum = pageNumber; numberInput.value =""; renderPageQueue(pageNumber); return ; >> alert("Enter a valide page numer"); > 

Источник

Оцените статью