Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions .github/workflows/check-package-versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,33 @@ concurrency:
on:
pull_request:
types: [opened, synchronize, reopened]
paths:
- "server/package.json"
- "collector/package.json"

jobs:
run-script:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v2
- uses: actions/checkout@v4

- uses: dorny/paths-filter@v3
id: filter
with:
filters: |
packages:
- 'server/package.json'
- 'collector/package.json'

- name: Set up Node.js
if: steps.filter.outputs.packages == 'true'
uses: actions/setup-node@v3
with:
node-version: '18'

- name: Run verifyPackageVersions.mjs script
if: steps.filter.outputs.packages == 'true'
run: |
cd extras/scripts
node verifyPackageVersions.mjs

- name: Fail job on error
if: failure()
run: exit 1
- name: Skip message
if: steps.filter.outputs.packages != 'true'
run: echo "No package.json changes detected, skipping version check"
3 changes: 1 addition & 2 deletions collector/.gitignore
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
hotdir/*
!hotdir/__HOTDIR__.md
yarn-error.log
!yarn.lock
outputs
scripts
.env.development
.env.production
.env.test
storage/
55 changes: 51 additions & 4 deletions collector/eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@ import { defineConfig } from "eslint/config";
import pluginPrettier from "eslint-plugin-prettier";
import configPrettier from "eslint-config-prettier";
import unusedImports from "eslint-plugin-unused-imports";
import pluginImport from "eslint-plugin-import";

export default defineConfig([
{ ignores: ["__tests__/**"] },
{
files: ["**/*.{js,mjs,cjs}"],
plugins: { js, prettier: pluginPrettier, "unused-imports": unusedImports },
files: ["**/*.js"],
plugins: {
js,
prettier: pluginPrettier,
"unused-imports": unusedImports,
import: pluginImport,
},
extends: ["js/recommended"],
languageOptions: { globals: { ...globals.node, ...globals.browser } },
languageOptions: {
sourceType: "commonjs",
globals: { ...globals.node, ...globals.browser },
},
rules: {
...configPrettier.rules,
"prettier/prettier": "error",
Expand All @@ -32,7 +41,45 @@ export default defineConfig([
argsIgnorePattern: "^_",
},
],
"import/no-unresolved": [
"error",
{ commonjs: true, ignore: ["^youtubei.js$"] },
],
"import/named": "error",
},
settings: {
"import/resolver": {
node: true,
},
"import/core-modules": ["eslint/config"],
},
},
{
files: ["**/*.mjs"],
plugins: {
js,
prettier: pluginPrettier,
"unused-imports": unusedImports,
import: pluginImport,
},
extends: ["js/recommended"],
languageOptions: {
sourceType: "module",
globals: { ...globals.node },
},
rules: {
...configPrettier.rules,
"prettier/prettier": "error",
"no-unused-vars": "off",
"unused-imports/no-unused-imports": "error",
"import/no-unresolved": "error",
"import/named": "error",
},
settings: {
"import/resolver": {
node: true,
},
"import/core-modules": ["eslint/config"],
},
},
{ files: ["**/*.js"], languageOptions: { sourceType: "commonjs" } },
]);
3 changes: 0 additions & 3 deletions collector/hotdir/__HOTDIR__.md

This file was deleted.

10 changes: 7 additions & 3 deletions collector/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ const { ACCEPTED_MIMES } = require("./utils/constants");
const { reqBody } = require("./utils/http");
const { processSingleFile } = require("./processSingleFile");
const { processLink, getLinkText } = require("./processLink");
const { wipeCollectorStorage } = require("./utils/files");
const {
ensureRequiredDirectoriesExist,
wipeCollectorStorage,
} = require("./utils/files");
const extensions = require("./extensions");
const { processRawText } = require("./processRawText");
const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
Expand Down Expand Up @@ -186,8 +189,9 @@ app.all("*", function (_, response) {
});

app
.listen(8888, async () => {
await wipeCollectorStorage();
.listen(8888, () => {
ensureRequiredDirectoriesExist();
wipeCollectorStorage();
console.log(`Document processor app listening on port 8888`);
})
.on("error", function (_) {
Expand Down
1 change: 1 addition & 0 deletions collector/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"cross-env": "^7.0.3",
"eslint": "^9.0.0",
"eslint-config-prettier": "^9.0.0",
"eslint-plugin-import": "^2.32.0",
"eslint-plugin-prettier": "^5.0.0",
"eslint-plugin-unused-imports": "^4.0.0",
"globals": "^17.4.0",
Expand Down
13 changes: 2 additions & 11 deletions collector/processSingleFile/index.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,13 @@
const path = require("path");
const fs = require("fs");
const {
WATCH_DIRECTORY,
SUPPORTED_FILETYPE_CONVERTERS,
} = require("../utils/constants");
const { SUPPORTED_FILETYPE_CONVERTERS } = require("../utils/constants");
const {
trashFile,
isTextType,
normalizePath,
isWithin,
WATCH_DIRECTORY,
} = require("../utils/files");
const RESERVED_FILES = ["__HOTDIR__.md"];

/**
* Process a single file and return the documents
Expand All @@ -32,12 +29,6 @@ async function processSingleFile(targetFilename, options = {}, metadata = {}) {
documents: [],
};

if (RESERVED_FILES.includes(targetFilename))
return {
success: false,
reason: "Filename is a reserved filename and cannot be processed.",
documents: [],
};
if (!fs.existsSync(fullFilePath))
return {
success: false,
Expand Down
2 changes: 0 additions & 2 deletions collector/storage/.gitignore

This file was deleted.

Empty file removed collector/storage/tmp/.placeholder
Empty file.
7 changes: 2 additions & 5 deletions collector/utils/OCRLoader/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ const fs = require("fs");
const os = require("os");
const path = require("path");
const { VALID_LANGUAGE_CODES } = require("./validLangs");
const { basePrimaryStoragePath } = require("../files");

class OCRLoader {
/**
Expand All @@ -22,11 +23,7 @@ class OCRLoader {
*/
constructor({ targetLanguages = "eng" } = {}) {
this.language = this.parseLanguages(targetLanguages);
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`)
: path.resolve(__dirname, `../../../server/storage/models/tesseract`)
);
this.cacheDir = path.resolve(basePrimaryStoragePath, "models/tesseract");

// Ensure the cache directory exists or else Tesseract will persist the cache in the default location.
if (!fs.existsSync(this.cacheDir))
Expand Down
8 changes: 2 additions & 6 deletions collector/utils/WhisperProviders/localWhisper.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const fs = require("fs");
const path = require("path");
const { v4 } = require("uuid");
const { basePrimaryStoragePath } = require("../files");
const defaultWhisper = "Xenova/whisper-small"; // Model Card: https://huggingface.co/Xenova/whisper-small
const fileSize = {
"Xenova/whisper-small": "250mb",
Expand All @@ -11,12 +12,7 @@ class LocalWhisper {
constructor({ options }) {
this.model = options?.WhisperModelPref ?? defaultWhisper;
this.fileSize = fileSize[this.model];
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`)
: path.resolve(__dirname, `../../../server/storage/models`)
);

this.cacheDir = path.resolve(basePrimaryStoragePath, "models");
this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
// Make directory when it does not exist in existing installations
if (!fs.existsSync(this.cacheDir))
Expand Down
10 changes: 2 additions & 8 deletions collector/utils/comKey/index.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,8 @@
const crypto = require("crypto");
const fs = require("fs");
const path = require("path");
const keyPath =
process.env.NODE_ENV === "development"
? path.resolve(__dirname, `../../../server/storage/comkey`)
: path.resolve(
process.env.STORAGE_DIR ??
path.resolve(__dirname, `../../../server/storage`),
`comkey`
);
const { basePrimaryStoragePath } = require("../files");
const keyPath = path.resolve(basePrimaryStoragePath, "comkey");

class CommunicationKey {
#pubKeyName = "ipc-pub.pem";
Expand Down
3 changes: 0 additions & 3 deletions collector/utils/constants.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
const WATCH_DIRECTORY = require("path").resolve(__dirname, "../hotdir");

const ACCEPTED_MIMES = {
"text/plain": [".txt", ".md", ".org", ".adoc", ".rst"],
"text/html": [".html"],
Expand Down Expand Up @@ -77,6 +75,5 @@ const SUPPORTED_FILETYPE_CONVERTERS = {

module.exports = {
SUPPORTED_FILETYPE_CONVERTERS,
WATCH_DIRECTORY,
ACCEPTED_MIMES,
};
25 changes: 23 additions & 2 deletions collector/utils/downloadURIToFile/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
const { WATCH_DIRECTORY } = require("../constants");
const {
isWithin,
WATCH_DIRECTORY,
normalizePath,
sanitizeFileName,
} = require("../files");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
Expand Down Expand Up @@ -37,7 +42,23 @@ async function downloadURIToFile(url, maxTimeout = 10_000) {
urlObj.pathname.replace(/\//g, "-"),
{ lower: true }
)}`;
const localFilePath = path.join(WATCH_DIRECTORY, filename);
const localFilePath = normalizePath(
path.resolve(WATCH_DIRECTORY, sanitizeFileName(filename))
);

if (!isWithin(path.resolve(WATCH_DIRECTORY), localFilePath)) {
console.error(
`[DownloadURIToFile]: File name ${localFilePath} is not within the storage path ${path.resolve(
WATCH_DIRECTORY
)}`
);
return {
success: false,
reason: "File name is not within the storage path.",
fileLocation: null,
};
}

const writeStream = fs.createWriteStream(localFilePath);
await pipeline(res.body, writeStream);

Expand Down
15 changes: 6 additions & 9 deletions collector/utils/extensions/Confluence/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments, sanitizeFileName } = require("../../files");
const {
writeToServerDocuments,
sanitizeFileName,
documentsFolder,
} = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { ConfluencePagesLoader } = require("./ConfluenceLoader");

Expand Down Expand Up @@ -80,14 +84,7 @@ async function loadConfluence(
`confluence-${hostname}-${v4().slice(0, 4)}`
).toLowerCase();

const outFolderPath =
process.env.NODE_ENV === "development"
? path.resolve(
__dirname,
`../../../../server/storage/documents/${outFolder}`
)
: path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

const outFolderPath = path.resolve(documentsFolder, outFolder);
if (!fs.existsSync(outFolderPath))
fs.mkdirSync(outFolderPath, { recursive: true });

Expand Down
6 changes: 2 additions & 4 deletions collector/utils/extensions/DrupalWiki/DrupalWiki/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
const { htmlToText } = require("html-to-text");
const { tokenizeString } = require("../../../tokenizer");
const {
WATCH_DIRECTORY,
sanitizeFileName,
writeToServerDocuments,
documentsFolder,
Expand All @@ -18,10 +19,7 @@ const { default: slugify } = require("slugify");
const path = require("path");
const fs = require("fs");
const { processSingleFile } = require("../../../../processSingleFile");
const {
WATCH_DIRECTORY,
SUPPORTED_FILETYPE_CONVERTERS,
} = require("../../../constants");
const { SUPPORTED_FILETYPE_CONVERTERS } = require("../../../constants");

class Page {
/**
Expand Down
11 changes: 2 additions & 9 deletions collector/utils/extensions/RepoLoader/GithubRepo/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../../../files");
const { writeToServerDocuments, documentsFolder } = require("../../../files");
const { tokenizeString } = require("../../../tokenizer");

/**
Expand Down Expand Up @@ -38,14 +38,7 @@ async function loadGithubRepo(args, response) {
`${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
).toLowerCase();

const outFolderPath =
process.env.NODE_ENV === "development"
? path.resolve(
__dirname,
`../../../../../server/storage/documents/${outFolder}`
)
: path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);

const outFolderPath = path.resolve(documentsFolder, outFolder);
if (!fs.existsSync(outFolderPath))
fs.mkdirSync(outFolderPath, { recursive: true });

Expand Down
Loading
Loading