feat: add OCR for image descriptions using tesseract.js (#1433)

* feat: add OCR for image descriptions using tesseract.js

* tweak style of alt editor

* remove unnecessary files
This commit is contained in:
Nolan Lawson 2019-08-25 18:33:44 -07:00 committed by GitHub
parent bd2a7abe2a
commit ca9a32d303
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 272 additions and 12 deletions

View File

@ -53,5 +53,6 @@ module.exports = [
{ id: 'fa-refresh', src: 'src/thirdparty/font-awesome-svg-png/white/svg/refresh.svg' },
{ id: 'fa-plus', src: 'src/thirdparty/font-awesome-svg-png/white/svg/plus.svg' },
{ id: 'fa-info-circle', src: 'src/thirdparty/font-awesome-svg-png/white/svg/info-circle.svg' },
{ id: 'fa-crosshairs', src: 'src/thirdparty/font-awesome-svg-png/white/svg/crosshairs.svg' }
{ id: 'fa-crosshairs', src: 'src/thirdparty/font-awesome-svg-png/white/svg/crosshairs.svg' },
{ id: 'fa-magic', src: 'src/thirdparty/font-awesome-svg-png/white/svg/magic.svg' }
]

View File

@ -66,6 +66,7 @@
"express": "^4.17.1",
"file-api": "^0.10.4",
"file-drop-element": "0.2.0",
"file-loader": "^4.2.0",
"form-data": "^2.5.0",
"glob": "^7.1.4",
"indexeddb-getall-shim": "^1.3.5",
@ -97,6 +98,8 @@
"svelte-transitions": "^1.2.0",
"svgo": "^1.3.0",
"terser-webpack-plugin": "^1.4.1",
"tesseract.js": "^2.0.0-alpha.13",
"tesseract.js-core": "^2.0.0-beta.10",
"text-encoding": "^0.7.0",
"tiny-queue": "^0.2.1",
"webpack": "^4.39.2",

View File

@ -2,7 +2,7 @@
<textarea
id="the-media-alt-input-{realm}-{index}"
class="media-alt-input"
placeholder="Description"
placeholder="Describe for the visually impaired"
ref:textarea
bind:value=rawText
></textarea>
@ -20,6 +20,21 @@
max={mediaAltCharLimit}
style="width: 100%; text-align: right;"
/>
<button class="extract-text-button" type="button"
on:click="onClick()"
disabled={extracting}
>
<SvgIcon href="{extracting ? '#fa-spinner' : '#fa-magic'}"
className="extract-text-svg {extracting ? 'spin' : ''}"
/>
<span>
{#if extracting}
Extracting text…
{:else}
Extract text from image
{/if}
</span>
</button>
</div>
<style>
.media-alt-editor {
@ -38,12 +53,35 @@
max-height: 70vh;
}
.extract-text-button {
display: flex;
justify-content: center;
align-items: center;
margin-top: 10px;
}
.extract-text-button span {
margin-left: 15px;
}
:global(.extract-text-svg) {
fill: var(--button-text);
width: 18px;
height: 18px;
}
@media (max-height: 767px) {
.media-alt-input {
max-height: 40vh;
width: 100%;
}
}
@media (min-height: 768px) {
.media-alt-input {
min-width: 250px;
}
}
</style>
<script>
import { requestPostAnimationFrame } from '../../../_utils/requestPostAnimationFrame'
@ -57,6 +95,9 @@
import LengthGauge from '../../LengthGauge.html'
import LengthIndicator from '../../LengthIndicator.html'
import { length } from 'stringz'
import { runTesseract } from '../../../_utils/runTesseract'
import SvgIcon from '../../SvgIcon.html'
import { toast } from '../../toast/toast'
const updateRawTextInStore = throttleTimer(requestPostAnimationFrame)
@ -72,11 +113,13 @@
store: () => store,
data: () => ({
rawText: '',
mediaAltCharLimit: MEDIA_ALT_CHAR_LIMIT
mediaAltCharLimit: MEDIA_ALT_CHAR_LIMIT,
extracting: false
}),
computed: {
length: ({ rawText }) => length(rawText || ''),
overLimit: ({ mediaAltCharLimit, length }) => length > mediaAltCharLimit
overLimit: ({ mediaAltCharLimit, length }) => length > mediaAltCharLimit,
url: ({ media, index }) => get(media, [index, 'data', 'url'])
},
methods: {
observe,
@ -118,11 +161,32 @@
},
measure () {
autosize.update(this.refs.textarea)
},
async onClick () {
this.set({ extracting: true })
try {
const { url } = this.get()
const text = await runTesseract(url)
const { media, index, realm } = this.get()
if (media[index].description !== text) {
media[index].description = text
this.store.setComposeData(realm, { media })
this.store.save()
}
} catch (err) {
console.error(err)
/* no await */ toast.say(
'Unable to extract text. Ensure your instance supports cross-origin resource sharing (CORS) for images.'
)
} finally {
this.set({ extracting: false })
}
}
},
components: {
LengthGauge,
LengthIndicator
LengthIndicator,
SvgIcon
}
}
</script>

View File

@ -47,3 +47,7 @@ export const importSnackbar = () => import(
export const importComposeBox = () => import(
/* webpackChunkName: 'ComposeBox.html' */ '../_components/compose/ComposeBox.html'
).then(getDefault)
// Dynamically imports the Tesseract worker module and passes the loaded
// module through getDefault. The webpackChunkName magic comment asks webpack
// to name the resulting chunk 'tesseractWorker'.
export const importTesseractWorker = () => import(
/* webpackChunkName: 'tesseractWorker' */ '../_utils/tesseractWorker.js'
).then(getDefault)

View File

@ -0,0 +1,19 @@
import { importTesseractWorker } from '../_utils/asyncModules'
/**
 * Recognize the text in an image using the lazily-loaded Tesseract worker.
 *
 * @param image - value handed straight to the worker's `recognize()` call
 * @returns a promise for the `text` property of the recognition result
 */
export async function runTesseract (image) {
  const tesseract = await importTesseractWorker()

  // TODO: have to trick tesseract into not creating a blob URL because that would break our CSP
  // see https://github.com/naptha/tesseract.js/pull/322
  const savedBlob = window.Blob
  const startJob = () => {
    window.Blob = null
    try {
      return tesseract.recognize(image)
    } finally {
      // Always put the real Blob back, whether or not recognize() threw.
      window.Blob = savedBlob
    }
  }

  const job = startJob()
  job.progress(update => console.log('progress', update))
  const { text } = await job
  return text
}

View File

@ -0,0 +1,21 @@
import workerPath from 'tesseract.js/dist/worker.min.js'
// TODO: we should use .wasm instead of .wasm.js. But currently can't because:
// 1. not supported https://github.com/naptha/tesseract.js/blob/9f1e782/docs/local-installation.md#corepath
// 2. webpack defaultRules issues (fixable with https://github.com/webpack/webpack/issues/8412#issuecomment-445586591)
// We should explore this at a later date.
import corePath from 'tesseract.js-core/tesseract-core.wasm.js'
import { TesseractWorker } from 'tesseract.js'
// tesseract has a bug where broken image URLs will silently fail. We could spawn a new worker
// every time to work around the issue, but then it literally spawns a new web worker for each request,
// which seems excessive. So we just live with the bug for now.
// https://github.com/naptha/tesseract.js/issues/325
// Every path handed to Tesseract is made absolute by prefixing the page's
// origin followed by a slash.
const fromOrigin = relativePath => `${location.origin}/${relativePath}`

const tesseractWorker = new TesseractWorker({
  workerPath: fromOrigin(workerPath),
  langPath: fromOrigin(''),
  corePath: fromOrigin(corePath)
})

export default tesseractWorker

View File

@ -18,11 +18,13 @@ const assets = __assets__
.map(file => file.startsWith('/') ? file : `/${file}`)
.filter(filename => !filename.endsWith('.map'))
.filter(filename => filename !== '/robots.txt')
.filter(filename => !filename.includes('traineddata.gz')) // Tesseract already caches it in IDB
// `shell` is an array of all the files generated by webpack
// also contains '/index.html' for some reason
const webpackAssets = __shell__
.filter(filename => !filename.endsWith('.map')) // don't bother with sourcemaps
.filter(filename => !filename.includes('tesseract-core.wasm')) // cache on-demand
// `routes` is an array of `{ pattern: RegExp }` objects that
// match the pages in your src
@ -93,6 +95,17 @@ self.addEventListener('fetch', event => {
if (response) {
return response
}
if (/tesseract-core\.wasm/.test(url.pathname)) {
// cache this on-demand
const response = await fetch(req)
if (response && response.status >= 200 && response.status < 300) {
const clonedResponse = response.clone()
/* no await */ caches.open(WEBPACK_ASSETS).then(cache => cache.put(req, clonedResponse))
}
return response
}
// for routes, serve the /service-worker-index.html file from the most recent
// static cache
if (routes.find(route => route.pattern.test(url.pathname))) {

BIN
static/eng.traineddata.gz Normal file

Binary file not shown.

View File

@ -38,7 +38,24 @@ module.exports = {
{
test: /\/_workers\/blurhash\.js$/,
use: {
loader: 'worker-loader'
loader: 'worker-loader',
options: {
name: 'blurhash.[hash].[name].[ext]'
}
}
},
{
test: [
/tesseract\.js\/dist\/worker\.min\.js$/,
/tesseract\.js\/dist\/worker\.min\.js.map$/,
/tesseract\.js-core\/tesseract-core\.wasm$/,
/tesseract\.js-core\/tesseract-core\.wasm.js$/
],
use: {
loader: 'file-loader',
options: {
name: 'tesseract-asset.[hash].[name].[ext]'
}
}
},
{
@ -109,6 +126,9 @@ module.exports = {
]),
devtool: dev ? 'inline-source-map' : 'source-map',
performance: {
hints: dev ? false : 'error' // fail if we exceed the default performance budgets
hints: dev ? false : 'error',
assetFilter: assetFilename => {
return !(/\.map$/.test(assetFilename)) && !/tesseract-asset/.test(assetFilename)
}
}
}

View File

@ -3,9 +3,22 @@ const config = require('sapper/config/webpack.js')
const pkg = require('../package.json')
const { mode, dev, resolve, inlineSvgs } = require('./shared.config')
// modules that the server should ignore, either because they cause errors or warnings
// (because they're only used on the client side)
const NOOP_MODULES = [
'page-lifecycle/dist/lifecycle.mjs',
'../_workers/blurhash',
'tesseract.js/dist/worker.min.js',
'tesseract.js/dist/worker.min.js.map',
'tesseract.js-core/tesseract-core.wasm',
'tesseract.js-core/tesseract-core.wasm.js',
'tesseract.js'
]
const serverResolve = JSON.parse(JSON.stringify(resolve))
serverResolve.alias['page-lifecycle/dist/lifecycle.mjs'] = 'lodash-es/noop' // page lifecycle fails in Node
serverResolve.alias['../_workers/blurhash'] = 'lodash-es/noop' // not used on the server side
NOOP_MODULES.forEach(mod => {
serverResolve.alias[mod] = 'lodash-es/noop'
})
module.exports = {
entry: config.server.entry(),

View File

@ -1,11 +1,12 @@
const TerserWebpackPlugin = require('terser-webpack-plugin')
module.exports = () => new TerserWebpackPlugin({
exclude: /tesseract-asset/,
cache: true,
parallel: true,
sourceMap: true,
terserOptions: {
ecma: 6,
ecma: 8,
mangle: true,
compress: {
pure_funcs: ['console.log']

105
yarn.lock
View File

@ -811,6 +811,14 @@ aws4@^1.8.0:
resolved "https://registry.yarnpkg.com/aws4/-/aws4-1.8.0.tgz#f0e003d9ca9e7f59c7a508945d7b2ef9a04a542f"
integrity sha512-ReZxvNHIOv88FlT7rxcXIIC0fPt4KZqZbOlivyWtXLt8ESx84zd3kMC6iK5jVeS2qt+g7ftS7ye4fi06X5rtRQ==
axios@^0.18.0:
version "0.18.1"
resolved "https://registry.yarnpkg.com/axios/-/axios-0.18.1.tgz#ff3f0de2e7b5d180e757ad98000f1081b87bcea3"
integrity sha512-0BfJq4NSfQXd+SkFdrvFbG7addhYSBA2mQwISr46pD6E5iqkWg02RAs8vyTT/j0RTnoYmeXauBuSv1qKwR179g==
dependencies:
follow-redirects "1.5.10"
is-buffer "^2.0.2"
babel-code-frame@^6.26.0:
version "6.26.0"
resolved "https://registry.yarnpkg.com/babel-code-frame/-/babel-code-frame-6.26.0.tgz#63fd43f7dc1e3bb7ce35947db8fe369a3f58c74b"
@ -1561,6 +1569,11 @@ blurhash@^1.1.3:
resolved "https://registry.yarnpkg.com/blurhash/-/blurhash-1.1.3.tgz#dc325af7da836d07a0861d830bdd63694382483e"
integrity sha512-yUhPJvXexbqbyijCIE/T2NCXcj9iNPhWmOKbPTuR/cm7Q5snXYIfnVnz6m7MWOXxODMz/Cr3UcVkRdHiuDVRDw==
bmp-js@^0.1.0:
version "0.1.0"
resolved "https://registry.yarnpkg.com/bmp-js/-/bmp-js-0.1.0.tgz#e05a63f796a6c1ff25f4771ec7adadc148c07233"
integrity sha1-4Fpj95amwf8l9Hcex62twUjAcjM=
bn.js@^4.0.0, bn.js@^4.1.0, bn.js@^4.1.1, bn.js@^4.4.0:
version "4.11.8"
resolved "https://registry.yarnpkg.com/bn.js/-/bn.js-4.11.8.tgz#2cde09eb5ee341f484746bb0309b3253b1b1442f"
@ -1922,6 +1935,11 @@ check-error@^1.0.2:
resolved "https://registry.yarnpkg.com/check-error/-/check-error-1.0.2.tgz#574d312edd88bb5dd8912e9286dd6c0aed4aac82"
integrity sha1-V00xLt2Iu13YkS6Sht1sCu1KrII=
check-types@^7.4.0:
version "7.4.0"
resolved "https://registry.yarnpkg.com/check-types/-/check-types-7.4.0.tgz#0378ec1b9616ec71f774931a3c6516fad8c152f4"
integrity sha512-YbulWHdfP99UfZ73NcUDlNJhEIDgm9Doq9GhpyXbF+7Aegi3CVV7qqMCKTTqJxlvEvnQBp9IA+dxsGN6xK/nSg==
check-types@^8.0.3:
version "8.0.3"
resolved "https://registry.yarnpkg.com/check-types/-/check-types-8.0.3.tgz#3356cca19c889544f2d7a95ed49ce508a0ecf552"
@ -2535,6 +2553,13 @@ debug@3.2.6, debug@^3.2.6:
dependencies:
ms "^2.1.1"
debug@=3.1.0:
version "3.1.0"
resolved "https://registry.yarnpkg.com/debug/-/debug-3.1.0.tgz#5bb5a0672628b64149566ba16819e61518c67261"
integrity sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==
dependencies:
ms "2.0.0"
debug@^4.0.1, debug@^4.1.0:
version "4.1.1"
resolved "https://registry.yarnpkg.com/debug/-/debug-4.1.1.tgz#3b72260255109c6b589cee050f1d516139664791"
@ -3409,6 +3434,19 @@ file-entry-cache@^5.0.1:
resolved "https://registry.yarnpkg.com/file-error/-/file-error-0.10.2.tgz#963b48b9273b3d4b84b400ee571bc78b1739724a"
integrity sha1-ljtIuSc7PUuEtADuVxvHixc5cko=
file-loader@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/file-loader/-/file-loader-4.2.0.tgz#5fb124d2369d7075d70a9a5abecd12e60a95215e"
integrity sha512-+xZnaK5R8kBJrHK0/6HRlrKNamvVS5rjyuju+rnyxRGuwUJwpAMsVzUl5dz6rK8brkzjV6JpcFNjp6NqV0g1OQ==
dependencies:
loader-utils "^1.2.3"
schema-utils "^2.0.0"
file-type@^10.5.0:
version "10.11.0"
resolved "https://registry.yarnpkg.com/file-type/-/file-type-10.11.0.tgz#2961d09e4675b9fb9a3ee6b69e9cd23f43fd1890"
integrity sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw==
"filereader@>= 0.10.3", filereader@^0.10.3:
version "0.10.3"
resolved "https://registry.yarnpkg.com/filereader/-/filereader-0.10.3.tgz#c747d4a2cd8f61e5418a7c07fe1257a43f0acdb1"
@ -3514,6 +3552,13 @@ flush-write-stream@^1.0.0:
inherits "^2.0.3"
readable-stream "^2.3.6"
follow-redirects@1.5.10:
version "1.5.10"
resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.5.10.tgz#7b7a9f9aea2fdff36786a94ff643ed07f4ff5e2a"
integrity sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==
dependencies:
debug "=3.1.0"
for-in@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/for-in/-/for-in-1.0.2.tgz#81068d295a8142ec0ac726c6e2200c30fb6d5e80"
@ -4048,6 +4093,11 @@ icss-utils@^4.0.0, icss-utils@^4.1.1:
dependencies:
postcss "^7.0.14"
idb-keyval@^3.1.0:
version "3.2.0"
resolved "https://registry.yarnpkg.com/idb-keyval/-/idb-keyval-3.2.0.tgz#cbbf354deb5684b6cdc84376294fc05932845bd6"
integrity sha512-slx8Q6oywCCSfKgPgL0sEsXtPVnSbTLWpyiDcu6msHOyKOLari1TD1qocXVCft80umnkk3/Qqh3lwoFt8T/BPQ==
ieee754@^1.1.4:
version "1.1.13"
resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.1.13.tgz#ec168558e95aa181fd87d37f55c32bbcb6708b84"
@ -4256,7 +4306,7 @@ is-buffer@^1.1.5:
resolved "https://registry.yarnpkg.com/is-buffer/-/is-buffer-1.1.6.tgz#efaa2ea9daa0d7ab2ea13a97b2b8ad51fefbe8be"
integrity sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==
is-buffer@~2.0.3:
is-buffer@^2.0.2, is-buffer@~2.0.3:
version "2.0.3"
resolved "https://registry.yarnpkg.com/is-buffer/-/is-buffer-2.0.3.tgz#4ecf3fcf749cbd1e472689e109ac66261a25e725"
integrity sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw==
@ -4466,6 +4516,16 @@ is-typedarray@~1.0.0:
resolved "https://registry.yarnpkg.com/is-typedarray/-/is-typedarray-1.0.0.tgz#e479c80858df0c1b11ddda6940f96011fcda4a9a"
integrity sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=
is-url@1.2.2:
version "1.2.2"
resolved "https://registry.yarnpkg.com/is-url/-/is-url-1.2.2.tgz#498905a593bf47cc2d9e7f738372bbf7696c7f26"
integrity sha1-SYkFpZO/R8wtnn9zg3K792lsfyY=
is-url@^1.2.4:
version "1.2.4"
resolved "https://registry.yarnpkg.com/is-url/-/is-url-1.2.4.tgz#04a4df46d28c4cff3d73d01ff06abeb318a1aa52"
integrity sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==
is-utf8@^0.2.0:
version "0.2.1"
resolved "https://registry.yarnpkg.com/is-utf8/-/is-utf8-0.2.1.tgz#4b0da1442104d1b336340e80797e865cf39f7d72"
@ -5276,7 +5336,7 @@ node-environment-flags@1.0.5:
object.getownpropertydescriptors "^2.0.3"
semver "^5.7.0"
node-fetch@^2.6.0:
node-fetch@^2.3.0, node-fetch@^2.6.0:
version "2.6.0"
resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-2.6.0.tgz#e633456386d4aa55863f676a7ab0daa8fdecb0fd"
integrity sha512-8dG4H5ujfvFiqDmVu9fQ5bOHUC15JMjMY/Zumv26oOvvVJjM67KF8koCWIabKQ1GJIa9r2mMZscBq/TbdOcmNA==
@ -5593,6 +5653,11 @@ onetime@^2.0.0, onetime@^2.0.1:
dependencies:
mimic-fn "^1.0.0"
opencollective-postinstall@^2.0.2:
version "2.0.2"
resolved "https://registry.yarnpkg.com/opencollective-postinstall/-/opencollective-postinstall-2.0.2.tgz#5657f1bede69b6e33a45939b061eb53d3c6c3a89"
integrity sha512-pVOEP16TrAO2/fjej1IdOyupJY8KDUM1CvsaScRbw6oddvpQoOfGk4ywha0HKKVAD6RkW4x6Q+tNBwhf3Bgpuw==
opener@^1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/opener/-/opener-1.5.1.tgz#6d2f0e77f1a0af0032aca716c2c1fbb8e7e8abed"
@ -7440,6 +7505,37 @@ terser@^4.1.2:
source-map "~0.6.1"
source-map-support "~0.5.12"
tesseract.js-core@^2.0.0-beta.10:
version "2.0.0-beta.10"
resolved "https://registry.yarnpkg.com/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.10.tgz#b8f0dd2be4686650c4350f648900adccfaf58d6b"
integrity sha512-QmNgMA9m5ES5uMTqpOAPysrUA80vUx/6WKQlfkK3zhOeAgqv8DjwwcDv9tQv2TgRzOQ+LFKrJn94Y2rw5b2IGw==
tesseract.js-utils@^1.0.0-beta.8:
version "1.0.0-beta.8"
resolved "https://registry.yarnpkg.com/tesseract.js-utils/-/tesseract.js-utils-1.0.0-beta.8.tgz#d1ef25c12609a337c3e0ac12a33f9903f3145a68"
integrity sha512-qjHBfWfzo2o1ZY9XI0Wh2hmpp38+mIgCMOk60W5Yyie/pBl421VLBKOZUEwQgpbLnOJ24VU6Q8yXsVgtFFHcFg==
dependencies:
axios "^0.18.0"
bmp-js "^0.1.0"
file-type "^10.5.0"
idb-keyval "^3.1.0"
is-url "^1.2.4"
zlibjs "^0.3.1"
tesseract.js@^2.0.0-alpha.13:
version "2.0.0-alpha.13"
resolved "https://registry.yarnpkg.com/tesseract.js/-/tesseract.js-2.0.0-alpha.13.tgz#87bb3d71fe646c0993b073552241d203d9dfef3a"
integrity sha512-ZFEdak7jWtN5vIDwZcw8OdAqA7RvG0QRailZKQFS5rtnl/Yy5vC4WcqfJh9+o+cA3bdr2zV5SENoWDtEihlSVA==
dependencies:
axios "^0.18.0"
check-types "^7.4.0"
is-url "1.2.2"
node-fetch "^2.3.0"
opencollective-postinstall "^2.0.2"
resolve-url "^0.2.1"
tesseract.js-core "^2.0.0-beta.10"
tesseract.js-utils "^1.0.0-beta.8"
testcafe-browser-tools@1.6.8:
version "1.6.8"
resolved "https://registry.yarnpkg.com/testcafe-browser-tools/-/testcafe-browser-tools-1.6.8.tgz#74ace1ee4c21a20bd6d88238f0d9bc97c596b8fb"
@ -8363,3 +8459,8 @@ yargs@^7.0.0:
which-module "^1.0.0"
y18n "^3.2.1"
yargs-parser "^5.0.0"
zlibjs@^0.3.1:
version "0.3.1"
resolved "https://registry.yarnpkg.com/zlibjs/-/zlibjs-0.3.1.tgz#50197edb28a1c42ca659cc8b4e6a9ddd6d444554"
integrity sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ=