Skip to content

Commit

Permalink
pdf text added
Browse files Browse the repository at this point in the history
  • Loading branch information
elementdavv committed Jul 30, 2024
1 parent 5af474d commit eabd7de
Show file tree
Hide file tree
Showing 39 changed files with 3,811 additions and 214 deletions.
11 changes: 5 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

[HathiTrust Digital Library](https://hathitrust.org) is also supported. All books with full view permission can be downloaded.

The extension works by fetching every leaf of a book, constructing a PDF stream on the fly targeting to disk storage. So it consumes only a little RAM resource and can handle books of thousands of leafs easily.
The extension works by fetching every leaf of a book and constructing a PDF stream on the fly that is written straight to disk storage, so it can easily handle books of thousands of leaves with little memory.

## Features
* Download a book as a PDF file
* Download a book as a collection of image files (JPEG/PNG) one for each leaf
## What It Can Do
* Download a book as a PDF file with text embedded
* Download a book as a collection of image files (JPEG/PNG) one for each leaf, and the text of the book
* Optional leaf range
* Optional leaf quality
* Download multiple books in parallel
Expand Down Expand Up @@ -63,12 +63,11 @@ The download process may take some breaks due to server constraints.
## Availability
* Chromium family(Chrome, Edge, Brave, Vivaldi, Opera, Yandex, Kiwi, etc) version 90+ supported
* Firefox version 115+ supported
* For Brave below version 1.67.119, item 'File System Access API' on the 'brave://flags' page should be enabled.

## License
[GPL3](LICENSE) ©Element Davv

Any questions and/or suggestions are appreciated.

## Donation
If you found the extension helpful consider supporting me with a coffee <a href='https://www.buymeacoffee.com/timelegend' target='_blank'><img src='resources/logo/bmc-orange.png' style='width:100px;height:28px'></a>.
If you found the extension helpful, consider buying me a cup of coffee to support ongoing development <a href='https://www.buymeacoffee.com/timelegend' target='_blank'><img src='resources/logo/bmc-orange.png' style='width:100px;height:28px'></a>.
147 changes: 104 additions & 43 deletions moz/js/content1.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,16 +41,15 @@ export default function(){
var ctrl; // ctrlKey
var alt; // altKey

window.onmessage = evt => {
window.onmessage = async evt => {
if (evt.origin != origin || evt.data.extid != extid) return;

const data = evt.data;
console.log(`message: ${data.cmd}`);

switch(data.cmd) {
case 'init':
br = JSON.parse(data.br);
init();
await init();
break;
case 'begin':
ctrl = false;
Expand Down Expand Up @@ -85,20 +84,17 @@ export default function(){
}

function loadButton() {
const ab = fromClass('action-buttons-section');

if (ab.length == 0) return;

console.log('load buttons');
ab[0].insertAdjacentHTML("afterbegin", buttonstring);
const iadlabel = document.getElementsByClassName('iadlabel');
if (iadlabel.length > 0) return;
const ab = fromClass('action-buttons-section');
ab[0]?.insertAdjacentHTML("afterbegin", buttonstring);
}

function loadScales() {
console.log('load scales');
var s = fromId('iadscaleid');

if (!s) return;

console.log('load scales');
const factors = br.reductionFactors;
const star = "★★★★";
var n = 0;
Expand All @@ -120,27 +116,33 @@ export default function(){
});
}

var fileid = ""; // book basename
var data = []; // page urls
var fontdata = null;

// fetch the Georgia AFM file shipped with the extension and keep its
// raw text in the shared `fontdata` variable
async function loadFont() {
    console.log('load font data');
    const afm = await fetch(chrome.runtime.getURL('/js/pdf/font/data/Georgia.afm'));
    fontdata = await afm.text();
}

var fileid = ''; // book basename
var data = []; // page image urls
var url2 = ''; // page text urls
var pagecount = 0; // page count

// copy what the downloader needs out of the BookReader object `br`:
// book id, leaf list, the base url of the per-page text service, and
// the number of leafs
function getBookInfo() {
    console.log('get book info');
    const { bookId, data: leafs, server, bookPath } = br;
    fileid = bookId;
    data = leafs;
    // the page number is appended to this url when a leaf is fetched
    url2 = `https://${server}/BookReader/BookReaderGetTextWrapper.php?path=${bookPath}_djvu.xml&mode=djvu_xml&page=`;
    pagecount = leafs.length;
}

var info = {}; // book metadata

function getMetadata() {
console.log('get metadata');
const title = fromClass('item-title');

if (title.length > 0) {
info.Title = title[0].innerText;
}

info.Title = br.bookTitle;
const meta = new Map();
meta.set('by', 'Author');
meta.set('Isbn', 'ISBN')
Expand Down Expand Up @@ -168,11 +170,12 @@ export default function(){
progress = fromId('iadprogressid');
}

function init(){
async function init(){
console.log('init begin');
loadCss("/css/iad.css");
loadButton();
loadScales();
await loadFont();
getBookInfo();
getMetadata();
getProgress();
Expand All @@ -189,7 +192,6 @@ export default function(){
if (sw) {
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
if (sender.id != chrome.runtime.id || (status != 1 && swaitcreate == false)) return;

console.log(message);

switch(message.cmd) {
Expand Down Expand Up @@ -398,7 +400,7 @@ export default function(){
} catch(e) {
// error from showSaveFilePicker
const message = e.toString();
console.log(message);
console.log(e);

// SecurityError: Failed to execute 'showSaveFilePicker' on 'Window': Must be handling a user gesture to show a file picker.
if (e.code == 18) {
Expand All @@ -419,6 +421,7 @@ export default function(){
var paused = 0; // paused count
var recover = 0; // recover file
var ac = null; // AbortController
var content = []; // book content for zip document
const FILELIMIT = 6; // parallel download limit
const TRYLIMIT = 3; // each leaf retry limit

Expand All @@ -434,6 +437,7 @@ export default function(){
paused = 0;
recover = 0;
ac = new AbortController();
content = Array.from({length: realcount}, (v, i) => '');
}

function getLeafs() {
Expand Down Expand Up @@ -492,6 +496,10 @@ export default function(){
}

async function clear() {
if (ctrl) {
createZIPText();
}

ac = null;
doc.end();
await writer.ready;
Expand All @@ -500,20 +508,20 @@ export default function(){

// take the next job off the queue and start fetching that leaf;
// does nothing when the queue is empty
function dispatch() {
if (jobs.isEmpty) return;

const job = jobs.deque();
const pageindex = job.pageindex;
// NOTE(review): presumably the retry counter for this leaf (see TRYLIMIT) - confirm
const tri = job.tri;
console.log(`chunk ${pageindex}`);
// image url: add scale/rotate, joined with '&' if the uri already has a query string, '?' otherwise
var uri = data[pageindex].uri;
uri += uri.indexOf("?") > -1 ? "&" : "?";
uri += `scale=${scale}&rotate=0`;
// text url: url2 followed by the page index
var uri2 = url2 + pageindex.toString();
// leaf dimensions divided by scale, rounded up
const width = Math.ceil(data[pageindex].width / scale);
const height = Math.ceil(data[pageindex].height / scale);
// syncfetch is async and is not awaited here
syncfetch(pageindex, tri, uri, width, height);
syncfetch(pageindex, tri, uri, uri2, width, height);
}

async function syncfetch(pageindex, tri, uri, width, height) {
async function syncfetch(pageindex, tri, uri, uri2, width, height) {
try {
const response = await fetch(uri, {
method: "GET",
Expand All @@ -522,22 +530,30 @@ export default function(){
signal: ac.signal,
});

if (response.ok) {
const buffer = await response.arrayBuffer();

if (doc) {
const view = new DataView(buffer);
createPage(view, pageindex, width, height);
nextLeaf();
}
}
else {
if (!response.ok) {
throw new Error(response.status);
}

const buffer = await response.arrayBuffer();
const view = new DataView(buffer);

const response2 = await fetch(uri2, {
method: "GET",
credentials: "include",
signal: ac.signal,
});

if (!response2.ok) {
throw new Error(response2.status);
}

var text = await response2.text();
createPage(view, text, pageindex, width, height);
nextLeaf();
}
catch(e) {
const message = e.toString();
console.log(message);
console.log(e);

if (!ac.signal.aborted) {
// chrome: failed to fetch
Expand Down Expand Up @@ -593,37 +609,45 @@ export default function(){
}
}

function createPage(view, pageindex, width, height) {
// hand a fetched leaf (image bytes in `view`, page text in `text`) to the
// zip writer when `ctrl` is set, otherwise to the pdf writer
function createPage(view, text, pageindex, width, height) {
console.log(`chunk ${pageindex} ready`);

if (ctrl) {
// zip output: width/height are not passed on
createZIPPage(view, pageindex);
createZIPPage(view, text, pageindex);
}
else {
createPDFPage(view, pageindex, width, height);
createPDFPage(view, text, pageindex, width, height);
}
}

function createZIPPage(view, pageindex) {
// keep the page text for the later text file and add the leaf image to the
// document under the name `<fileid>_<NNNN>`
function createZIPPage(view, text, pageindex) {
// NOTE(review): `content` is created with `realcount` entries; confirm that
// pageindex - startp + 1 always lands inside 0..realcount-1
content[pageindex - startp + 1] = text;
// the number used in the file name is pageindex + 1, zero padded to 4 digits
pageindex++;
const name = fileid + '_' + pageindex.toString().padStart(4, '0');
doc.image({view, name});
}

function createPDFPage(view, pageindex, width, height) {
// add the collected book text to the zip document as `<fileid>.txt`,
// utf-8 encoded
function createZIPText() {
    const bytes = new TextEncoder().encode(getContent());
    doc.image({
        view: new DataView(bytes.buffer),
        name: `${fileid}.txt`,
    });
}

// add one page of the given size to the pdf document and draw the leaf on it
function createPDFPage(view, text, pageindex, width, height) {
// shift pageindex by (startp - 1) before passing it to addPage
pageindex -= startp - 1;
// page has no margin and is exactly width x height
doc.addPage({
pageindex
, margin: 0
, size: [width, height]
});

// draw at the page origin (0, 0)
doc.image(view, 0, 0);
doc.image(view, text, 0, 0);
}

var filehandle = null; // filesystemfilehandle
var writer = null; // file stream writer
var doc = null; // pdf document object
var doc = null; // pdf/zip document object

async function createDoc() {
if (ctrl) {
Expand Down Expand Up @@ -690,6 +714,8 @@ export default function(){
doc = new PDFDocument(writer, {
pagecount: realcount
, info
, fontdata
, font: 'Times-Roman'
});
}

Expand All @@ -701,9 +727,44 @@ export default function(){
doc = new PDFDocument(writer, {
pagecount: realcount
, info
, fontdata
, font: 'Times-Roman'
});
}

// build the plain text of the book from the per-page xml kept in `content`:
// words of a LINE are joined with spaces, every LINE ends with a newline,
// every PARAGRAPH is followed by an empty line, and pages are separated by
// one more newline
function getContent() {
    // text of one LINE element; no separator is written while the
    // accumulated text is still empty
    const lineText = (line) => {
        let joined = '';
        for (const word of line.querySelectorAll('WORD')) {
            joined = (joined === '' ? '' : `${joined} `) + word.textContent;
        }
        return joined;
    };

    // text of one page's xml
    const pageText = (xml) => {
        const tree = new DOMParser().parseFromString(xml, 'text/xml');
        let page = '';
        for (const paragraph of tree.querySelectorAll('PARAGRAPH')) {
            for (const line of paragraph.querySelectorAll('LINE')) {
                page += `${lineText(line)}\n`;
            }
            page += '\n';
        }
        return page;
    };

    // a separator is only written once some text has been collected
    return content.reduce(
        (book, xml) => (book === '' ? '' : `${book}\n`) + pageText(xml),
        '',
    );
}

function returnBook() {
console.log('return the book.');
const uri = 'https://archive.org/services/loans/loan';
Expand Down
2 changes: 1 addition & 1 deletion moz/js/hathitrust.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* content.js
* hathitrust.js
* Copyright (C) 2023 Element Davv<elementdavv@hotmail.com>
*
* Distributed under terms of the GPL3 license.
Expand Down
Loading

0 comments on commit eabd7de

Please sign in to comment.