Skip to content

Commit c075fed

Browse files
committed
PDF Options support added.
1 parent 081893c commit c075fed

File tree

9 files changed

+354
-15
lines changed

9 files changed

+354
-15
lines changed

src/ExtractorService/Extractors/ImageExtractor.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
namespace Nilgems\PhpTextract\ExtractorService\Extractors;
44

55
use Nilgems\PhpTextract\ExtractorService\Contracts\AbstractTextExtractor;
6-
use Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions;
76
use Nilgems\PhpTextract\ExtractorService\Ocr\TesseractOcrRun;
7+
use Nilgems\PhpTextract\ExtractorService\Options\TesseractOcrOptions;
88

99
class ImageExtractor extends AbstractTextExtractor
1010
{

src/ExtractorService/Extractors/PdfExtractor.php

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,78 @@ class PdfExtractor extends AbstractTextExtractor
2121
protected function getExtractedText(): string
2222
{
2323
if ($this->hasOsExtensionInstalled()) {
24-
$file_path = $this->utilsService->getFilePath();
25-
$process = new Process(['pdftotext', '-layout', $file_path , '-']);
24+
$params = $this->getExtractionCommandParams();
25+
$process = new Process($params);
2626
$process->start();
2727
$process->wait();
2828
return $this->getFilteredOutput($process);
2929
}
3030
return "";
3131
}
3232

33+
/**
 * Build the argument list used to run the 'pdftotext' command.
 *
 * Every configured PDF option is translated into its command line switch, in a
 * fixed order, followed by the input file path and '-' as the output target
 * ('-' makes pdftotext write the extracted text to stdout).
 *
 * @return array The command name followed by its arguments, ready for Process.
 */
protected function getExtractionCommandParams(): array
{
    $file_path = $this->utilsService->getFilePath();
    $options = $this->utilsService->getPdfOptions();

    // Ordered [switch, value] pairs. A null value means the option was not set.
    // Boolean values are stand-alone switches that carry no argument.
    $switches = [
        ['-f', $options->firstPage],
        ['-l', $options->lastPage],
        ['-r', $options->resolution],
        ['-x', $options->xCoordinate],
        ['-y', $options->yCoordinate],
        ['-W', $options->widthOfCorpArea],
        ['-H', $options->heightOfCorpArea],
        ['-layout', $options->layoutModeEnabled],
        ['-fixed', $options->fixedPitch],
        ['-raw', $options->rawEnabled],
        ['-enc', $options->encodingName],
        ['-nopgbrk', $options->noPageBreaksEnabled],
        ['-opw', $options->ownerPassword],
        // The user password switch depends on the user password itself, not on
        // the owner password: otherwise a user password given on its own is
        // never sent, and an owner password given on its own emits '-upw' with
        // a null value.
        ['-upw', $options->userPassword],
    ];

    $params = ['pdftotext'];
    foreach ($switches as [$switch, $value]) {
        if ($value === null || $value === false) {
            continue;
        }
        $params[] = $switch;
        if ($value !== true) {
            $params[] = $value;
        }
    }

    $params[] = $file_path;
    $params[] = '-';

    return $params;
}
95+
3396
/**
3497
* Checks whether the 'pdftotext' extension is installed or enabled in the OS.
3598
* @return bool

src/ExtractorService/Ocr/TesseractOcrRun.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
namespace Nilgems\PhpTextract\ExtractorService\Ocr;
44

55
use Nilgems\PhpTextract\Exceptions\TextractException;
6-
use Nilgems\PhpTextract\ExtractorService\Ocr\Contracts\TesseractOcrOptions;
6+
use Nilgems\PhpTextract\ExtractorService\Options\TesseractOcrOptions;
77
use Nilgems\PhpTextract\Services\UtilsService;
88
use Symfony\Component\Process\Process;
99
use thiagoalessio\TesseractOCR\TesseractOCR;
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\ExtractorService\Options\Contracts;
4+
5+
abstract class AbstractOptions
{
    /**
     * Build a fresh options object of the concrete class the call was made on.
     * @return static
     */
    public static function create(): static
    {
        $instance = new static();

        return $instance;
    }
}
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
<?php
2+
3+
namespace Nilgems\PhpTextract\ExtractorService\Options;
4+
5+
use Nilgems\PhpTextract\ExtractorService\Options\Contracts\AbstractOptions;
6+
7+
class PdfOptions extends AbstractOptions
8+
{
9+
/**
10+
* Specifies the first page to convert.
11+
* @var int|null $firstPage
12+
*/
13+
public ?int $firstPage = null;
14+
/**
15+
* Specifies the last page to convert.
16+
* @var int|null $lastPage
17+
*/
18+
public ?int $lastPage = null;
19+
/**
20+
* Specifies the resolution, in DPI. The default is 72 DPI.
21+
* @var int|null $resolution
22+
*/
23+
public ?int $resolution = null;
24+
/**
25+
* Specifies the x-coordinate of the crop area top left corner
26+
* @var int|null
27+
*/
28+
public ?int $xCoordinate = null;
29+
/**
30+
* Specifies the y-coordinate of the crop area top left corner
31+
* @var int|null $yCoordinate
32+
*/
33+
public ?int $yCoordinate = null;
34+
/**
35+
* Specifies the width of crop area in pixels (default is 0)
36+
* @var int|null
37+
*/
38+
public ?int $widthOfCorpArea = null;
39+
/**
40+
* Specifies the height of crop area in pixels (default is 0)
41+
* @var int|null
42+
*/
43+
public ?int $heightOfCorpArea = null;
44+
/**
45+
* Maintain (as best as possible) the original physical layout of the text. The default is to 'undo'
46+
* physical layout (columns, hyphenation, etc.) and output the text in reading order.
47+
* @var bool $layoutModeEnabled
48+
*/
49+
public bool $layoutModeEnabled = false;
50+
51+
/**
52+
* Assume fixed-pitch (or tabular) text, with the specified character width (in points). This forces
53+
* physical layout mode.
54+
* @var int|null $fixedPitch
55+
*/
56+
public ?int $fixedPitch = null;
57+
/**
58+
* Keep the text in content stream order. This is a hack which often "undoes" column formatting,
59+
* etc. Use of raw mode is no longer recommended.
60+
* @var bool $rawEnabled
61+
*/
62+
public bool $rawEnabled = false;
63+
/**
64+
* Sets the encoding to use for text output. This defaults to "UTF-8".
65+
* @var string|null $encodingName
66+
*/
67+
public ?string $encodingName = null;
68+
/**
69+
* Don't insert page breaks (form feed characters) between pages.
70+
* @var bool $noPageBreaksEnabled
71+
*/
72+
public bool $noPageBreaksEnabled = false;
73+
/**
74+
* Specify the owner password for the PDF file. Providing this will bypass all security
75+
* restrictions.
76+
* @var string|null $ownerPassword
77+
*/
78+
public ?string $ownerPassword = null;
79+
/**
80+
* Specify the user password for the PDF file.
81+
* @var string|null $userPassword
82+
*/
83+
public ?string $userPassword = null;
84+
85+
86+
87+
/**
88+
* Specifies the first page to convert.
89+
* @param int $pageNo
90+
* @return $this
91+
*/
92+
public function addFirstPage(int $pageNo): static
93+
{
94+
$this->firstPage = $pageNo;
95+
return $this;
96+
}
97+
98+
/**
99+
* Specifies the last page to convert.
100+
* @param int $pageNo
101+
* @return $this
102+
*/
103+
public function addLastPage(int $pageNo): static
104+
{
105+
$this->lastPage = $pageNo;
106+
return $this;
107+
}
108+
109+
/**
110+
* Specifies the resolution, in DPI. The default is 72 DPI.
111+
* @param int $resolution
112+
* @return $this
113+
*/
114+
public function addResolution(int $resolution): static
115+
{
116+
$this->resolution = $resolution;
117+
return $this;
118+
}
119+
120+
/**
121+
* Specifies the x-coordinate of the crop area top left corner
122+
* @param int $xCoordinate
123+
* @return $this
124+
*/
125+
public function addXCoordinate(int $xCoordinate): static
126+
{
127+
$this->xCoordinate = $xCoordinate;
128+
return $this;
129+
}
130+
131+
/**
132+
* Specifies the y-coordinate of the crop area top left corner
133+
* @param int $yCoordinate
134+
* @return $this
135+
*/
136+
public function addYCoordinate(int $yCoordinate): static
137+
{
138+
$this->yCoordinate = $yCoordinate;
139+
return $this;
140+
}
141+
142+
/**
143+
* Specifies the width of crop area in pixels (default is 0)
144+
* @param int $width
145+
* @return $this
146+
*/
147+
public function addWidthOfCorpArea(int $width): static
148+
{
149+
$this->widthOfCorpArea = $width;
150+
return $this;
151+
}
152+
153+
/**
154+
* Specifies the height of crop area in pixels (default is 0)
155+
* @param int $height
156+
* @return $this
157+
*/
158+
public function addHeightOfCorpArea(int $height): static
159+
{
160+
$this->heightOfCorpArea = $height;
161+
return $this;
162+
}
163+
164+
/**
165+
* Maintain (as best as possible) the original physical layout of the text. The default is to 'undo'
166+
* physical layout (columns, hyphenation, etc.) and output the text in reading order.
167+
* @return $this
168+
*/
169+
public function useLayoutMode(): static
170+
{
171+
$this->layoutModeEnabled = true;
172+
return $this;
173+
}
174+
175+
/**
176+
* Assume fixed-pitch (or tabular) text, with the specified character width (in points). This forces
177+
* physical layout mode.
178+
* @param int $pitch
179+
* @return $this
180+
*/
181+
public function addFixedPitch(int $pitch): static
182+
{
183+
$this->fixedPitch = $pitch;
184+
return $this;
185+
}
186+
187+
/**
188+
* Keep the text in content stream order. This is a hack which often "undoes" column formatting,
189+
* etc. Use of raw mode is no longer recommended.
190+
* @return $this
191+
*/
192+
public function useRaw(): static
193+
{
194+
$this->rawEnabled = true;
195+
return $this;
196+
}
197+
198+
/**
199+
* Sets the encoding to use for text output. This defaults to "UTF-8".
200+
* @param string $encodingName
201+
* @return $this
202+
*/
203+
public function addEncodingName(string $encodingName): static
204+
{
205+
$this->encodingName = $encodingName;
206+
return $this;
207+
}
208+
209+
/**
210+
* Don't insert page breaks (form feed characters) between pages.
211+
* @return $this
212+
*/
213+
public function useNoPageBreak(): static
214+
{
215+
$this->noPageBreaksEnabled = true;
216+
return $this;
217+
}
218+
219+
/**
220+
* Specify the owner password for the PDF file. Providing this will bypass all security
221+
* restrictions.
222+
* @param string $password
223+
* @return $this
224+
*/
225+
public function addOwnerPassword(string $password): static
226+
{
227+
$this->ownerPassword = $password;
228+
return $this;
229+
}
230+
231+
/**
232+
* Specify the user password for the PDF file.
233+
* @param string $password
234+
* @return $this
235+
*/
236+
public function addUserPassword(string $password): static
237+
{
238+
$this->userPassword = $password;
239+
return $this;
240+
}
241+
}

src/ExtractorService/Ocr/Contracts/TesseractOcrOptions.php renamed to src/ExtractorService/Options/TesseractOcrOptions.php

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
11
<?php
22

3-
namespace Nilgems\PhpTextract\ExtractorService\Ocr\Contracts;
3+
namespace Nilgems\PhpTextract\ExtractorService\Options;
44

55
use Illuminate\Contracts\Support\Arrayable;
6+
use Nilgems\PhpTextract\ExtractorService\Options\Contracts\AbstractOptions;
67

7-
class TesseractOcrOptions implements Arrayable
8+
class TesseractOcrOptions extends AbstractOptions implements Arrayable
89
{
910
protected array $options;
1011

11-
12-
1312
public function __construct()
1413
{
1514
$this->options = [

0 commit comments

Comments
 (0)