Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,4 @@ JIRA_HOST=ADD_YOURS_HERE
JIRA_USERNAME=ADD_YOURS_HERE
JIRA_ACCESS_TOKEN=ADD_YOURS_HERE
JIRA_PROJECT_KEY=ADD_YOURS_HERE
POLARIS_AI_DATA_INSIGHT_API_KEY=ADD_YOURS_HERE # https://datainsight.polarisoffice.com/_api/keys
Binary file not shown.
Binary file not shown.
28 changes: 28 additions & 0 deletions examples/src/document_loaders/polaris_ai_datainsight.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import * as fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import { PolarisAIDataInsightLoader } from "@langchain/community/document_loaders/web/polaris_ai_datainsight";

// ES modules do not provide __filename/__dirname, so rebuild them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// API key taken from the environment; this is `undefined` when the variable is not set.
const apiKey = process.env.POLARIS_AI_DATA_INSIGHT_API_KEY;

/**
 * Reads the example .docx that sits next to this script, passes its bytes to
 * PolarisAIDataInsightLoader together with the API key from the environment,
 * and prints the documents returned by the loader.
 */
async function main() {
  const docxLocation = path.join(
    __dirname,
    "./example_data/polaris_ai_datainsight/example.docx"
  );
  const docxBytes = fs.readFileSync(docxLocation);

  // The loader is given raw bytes here, so the file name is supplied separately.
  const insightLoader = new PolarisAIDataInsightLoader({
    apiKey,
    file: docxBytes,
    filename: "example.docx",
  });

  console.log(await insightLoader.load());
}

// Any rejection from main() is reported on stderr instead of being left unhandled.
main().catch(console.error);
2 changes: 2 additions & 0 deletions libs/langchain-community/langchain.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ export const config = {
"document_loaders/web/sort_xyz_blockchain",
"document_loaders/web/spider": "document_loaders/web/spider",
"document_loaders/web/youtube": "document_loaders/web/youtube",
"document_loaders/web/polaris_ai_datainsight": "document_loaders/web/polaris_ai_datainsight",
"document_loaders/fs/chatgpt": "document_loaders/fs/chatgpt",
"document_loaders/fs/srt": "document_loaders/fs/srt",
"document_loaders/fs/pdf": "document_loaders/fs/pdf",
Expand Down Expand Up @@ -545,6 +546,7 @@ export const config = {
"document_loaders/web/confluence",
"document_loaders/web/couchbase",
"document_loaders/web/youtube",
"document_loaders/web/polaris_ai_datainsight",
"document_loaders/fs/chatgpt",
"document_loaders/fs/srt",
"document_loaders/fs/pdf",
Expand Down
17 changes: 17 additions & 0 deletions libs/langchain-community/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,12 @@
"@types/jsdom": "^21.1.1",
"@types/jsonwebtoken": "^9",
"@types/lodash": "^4",
"@types/mime-types": "^2.1.4",
"@types/mozilla-readability": "^0.2.1",
"@types/pdf-parse": "^1.1.1",
"@types/pg": "^8.11.0",
"@types/pg-copy-streams": "^1.2.2",
"@types/unzipper": "^0",
"@types/uuid": "^9",
"@types/word-extractor": "^1",
"@types/ws": "^8",
Expand Down Expand Up @@ -191,6 +193,7 @@
"mammoth": "^1.6.0",
"mariadb": "^3.4.0",
"mem0ai": "^2.1.8",
"mime-types": "^3.0.1",
"mongodb": "^6.17.0",
"mysql2": "^3.9.8",
"neo4j-driver": "^5.17.0",
Expand Down Expand Up @@ -218,6 +221,7 @@
"typeorm": "^0.3.20",
"typescript": "~5.8.3",
"typesense": "^1.5.3",
"unzipper": "^0.12.3",
"usearch": "^2.17.1",
"voy-search": "0.6.2",
"weaviate-client": "^3.5.2",
Expand Down Expand Up @@ -3087,6 +3091,15 @@
"import": "./document_loaders/web/youtube.js",
"require": "./document_loaders/web/youtube.cjs"
},
"./document_loaders/web/polaris_ai_datainsight": {
"types": {
"import": "./document_loaders/web/polaris_ai_datainsight.d.ts",
"require": "./document_loaders/web/polaris_ai_datainsight.d.cts",
"default": "./document_loaders/web/polaris_ai_datainsight.d.ts"
},
"import": "./document_loaders/web/polaris_ai_datainsight.js",
"require": "./document_loaders/web/polaris_ai_datainsight.cjs"
},
"./document_loaders/fs/chatgpt": {
"types": {
"import": "./document_loaders/fs/chatgpt.d.ts",
Expand Down Expand Up @@ -4337,6 +4350,10 @@
"document_loaders/web/youtube.js",
"document_loaders/web/youtube.d.ts",
"document_loaders/web/youtube.d.cts",
"document_loaders/web/polaris_ai_datainsight.cjs",
"document_loaders/web/polaris_ai_datainsight.js",
"document_loaders/web/polaris_ai_datainsight.d.ts",
"document_loaders/web/polaris_ai_datainsight.d.cts",
"document_loaders/fs/chatgpt.cjs",
"document_loaders/fs/chatgpt.js",
"document_loaders/fs/chatgpt.d.ts",
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import axios from "axios";
import { jest } from "@jest/globals";
import { Document } from "@langchain/core/documents";
import { PolarisAIDataInsightLoader } from "../web/polaris_ai_datainsight.js";

// ES modules do not provide __filename/__dirname, so rebuild them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Directory holding the fixture files used by these tests.
const EXAMPLES_DIR = path.resolve(
__dirname,
"../../document_loaders/tests/example_data/polaris_ai_datainsight"
);
// Input document handed to the loader through its `filePath` option.
const EXAMPLE_DOC_PATH = path.resolve(EXAMPLES_DIR, "example.docx");
// Zip archive whose bytes are returned as the body of the mocked axios.post call.
const MOCK_RESPONSE_ZIP_PATH = path.resolve(EXAMPLES_DIR, "example.zip");

/** Expected element counts for a single page of the mocked response. */
interface PageData {
// Number of elements on the page.
total: number;
// Number of text elements; optional, and the mock data omits it for page "2".
text?: number;
// Number of image elements on the page.
image: number;
}

/** Shape of the expected-counts fixture that the assertions compare loader output against. */
interface ResponseDataStructure {
// Document-wide element counts.
elements: {
total: number;
text: number;
image: number;
};
// `total` is the number of pages; every other key is a page id mapping to that page's counts.
pages: {
total: number;
// The index signature has to admit `number` as well, otherwise the `total` property would conflict with it.
[key: string]: PageData | number;
};
}

// Expected counts that the tests assert against the loader output.
// NOTE(review): presumably mirrors the contents of example.zip — update both together if the fixture changes.
const MOCK_RESPONSE_DATA_STRUCTURE: ResponseDataStructure = {
// Totals across the whole document (compared with docs.length in "element" mode
// and with the number of image tags in "single" mode).
elements: {
total: 10,
text: 5,
image: 5,
},
// Per-page counts, keyed by 1-based page id (compared in "page" mode).
pages: {
total: 2,
"1": {
total: 7,
text: 5,
image: 2,
},
"2": {
total: 3,
image: 3,
},
},
};

/**
 * Runs PolarisAIDataInsightLoader.load() in "element", "page" and "single"
 * mode. axios.post is mocked to resolve with status 200 and the bytes of
 * example.zip, so no network request is made. Each test receives a fresh
 * temporary resources directory that is deleted again afterwards.
 */
describe("PolarisAIDataInsightLoader Integration Tests", () => {
  // Parent directory under which the per-test temporary directories are created.
  const tempParentDir = path.join(EXAMPLES_DIR, "examples");
  // Image placeholder tag the tests expect in pageContent; group 1 is the resource id.
  const imageTagPattern = /<img src="#" alt="" id="([^"]+)"\/>/g;

  let tempResourcesDir: string;

  /** Returns the resource ids of all image placeholder tags found in `pageContent`. */
  const collectImageResourceIds = (pageContent: string) =>
    [...pageContent.matchAll(imageTagPattern)].map((m) => m[1]);

  /**
   * Asserts that `resourceId` is listed in the document's `metadata.resources`,
   * that the mapped path is an existing regular file, and that it sits two
   * directory levels below the temporary resources directory.
   */
  const expectResourceExtracted = (doc: Document, resourceId: string): void => {
    const resourcePath = doc.metadata.resources?.[resourceId];
    expect(resourcePath && fs.existsSync(resourcePath)).toBe(true);
    expect(resourcePath && fs.lstatSync(resourcePath).isFile()).toBe(true);
    expect(resourcePath && path.dirname(path.dirname(resourcePath))).toBe(
      tempResourcesDir
    );
  };

  beforeEach(() => {
    // mkdtempSync does not create missing parent directories, and git does not
    // track empty ones, so make sure the parent exists before creating the
    // unique per-test directory inside it.
    fs.mkdirSync(tempParentDir, { recursive: true });
    tempResourcesDir = fs.mkdtempSync(path.join(tempParentDir, "example_"));

    jest.spyOn(axios, "post").mockResolvedValue({
      status: 200,
      data: fs.readFileSync(MOCK_RESPONSE_ZIP_PATH),
    });
  });

  afterEach(() => {
    fs.rmSync(tempResourcesDir, { recursive: true, force: true });
    jest.restoreAllMocks();
  });

  it("should load documents in element mode", async () => {
    const loader = new PolarisAIDataInsightLoader({
      filePath: EXAMPLE_DOC_PATH,
      apiKey: "api_key",
      resourcesDir: tempResourcesDir,
      mode: "element",
    });
    const docs = await loader.load();

    expect(docs[0]).toBeInstanceOf(Document);
    expect(docs.length).toBe(MOCK_RESPONSE_DATA_STRUCTURE.elements.total);

    for (const doc of docs) {
      if (doc.metadata.type === "text") {
        // Text elements carry content and no extracted resources.
        expect(doc.pageContent).not.toBe("");
        expect(doc.metadata.resources).toBeUndefined();
      } else {
        // Non-text elements must reference a resource id that maps to a file on disk.
        const match = doc.pageContent.match(/id="([^"]+)"/);
        expect(match).not.toBeNull();
        if (!match) throw new Error("No image resource ID found");

        expectResourceExtracted(doc, match[1]);
      }
    }
  });

  it("should load documents in page mode", async () => {
    const loader = new PolarisAIDataInsightLoader({
      filePath: EXAMPLE_DOC_PATH,
      apiKey: "api_key",
      resourcesDir: tempResourcesDir,
      mode: "page",
    });
    const docs = await loader.load();

    expect(docs[0]).toBeInstanceOf(Document);
    expect(docs.length).toBe(MOCK_RESPONSE_DATA_STRUCTURE.pages.total);

    docs.forEach((doc, i) => {
      // Page ids in the expected-counts fixture are 1-based strings.
      const pageId = `${i + 1}`;
      const pageData = MOCK_RESPONSE_DATA_STRUCTURE.pages[
        pageId as keyof typeof MOCK_RESPONSE_DATA_STRUCTURE.pages
      ] as PageData;
      expect(doc.metadata.elements.length).toBe(pageData.total);
      expect(Object.keys(doc.metadata.resources).length).toBe(pageData.image);

      const resourceIds = collectImageResourceIds(doc.pageContent);
      expect(resourceIds.length).toBe(pageData.image);

      resourceIds.forEach((resourceId) => {
        expectResourceExtracted(doc, resourceId);
      });
    });
  });

  it("should load a single document in single mode", async () => {
    const loader = new PolarisAIDataInsightLoader({
      filePath: EXAMPLE_DOC_PATH,
      apiKey: "api_key",
      resourcesDir: tempResourcesDir,
      mode: "single",
    });
    const docs = await loader.load();

    expect(docs[0]).toBeInstanceOf(Document);
    expect(docs.length).toBe(1);

    const doc = docs[0];
    const resourceIds = collectImageResourceIds(doc.pageContent);
    expect(resourceIds.length).toBe(
      MOCK_RESPONSE_DATA_STRUCTURE.elements.image
    );

    resourceIds.forEach((resourceId) => {
      expectResourceExtracted(doc, resourceId);
    });
  });
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import { PolarisAIDataInsightLoader } from "../web/polaris_ai_datainsight.js";

// ES modules do not provide __filename/__dirname, so rebuild them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Directory holding the fixture files used by these tests.
const EXAMPLES_DIR = path.resolve(
__dirname,
"../../document_loaders/tests/example_data/polaris_ai_datainsight"
);
// Fixture used by the tests that expect construction to succeed.
const EXAMPLE_DOC_PATH = path.join(EXAMPLES_DIR, "example.docx");
// Fixture used by the tests that expect an "Unsupported file extension" error.
const EXAMPLE_UNSUPPORTED_DOC_PATH = path.join(EXAMPLES_DIR, "example.txt");
// Path used by the test that expects a "does not exist" error.
// NOTE(review): presumably no such file is present in the fixtures directory — confirm.
const EXAMPLE_NOT_EXIST_DOC_PATH = path.join(EXAMPLES_DIR, "no_file.docx");

// -- For Success Test -- //
/**
 * Constructor happy paths: the loader can be built either from a path on disk
 * or from an in-memory buffer accompanied by a file name.
 */
describe("PolarisAIDataInsightLoader - Success Initialization", () => {
  test("should initialize with filePath", () => {
    expect(
      new PolarisAIDataInsightLoader({
        filePath: EXAMPLE_DOC_PATH,
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      })
    ).toBeDefined();
  });

  test("should initialize with file and filename", () => {
    const docBytes = fs.readFileSync(EXAMPLE_DOC_PATH);
    const docName = path.basename(EXAMPLE_DOC_PATH);

    expect(
      new PolarisAIDataInsightLoader({
        file: docBytes,
        filename: docName,
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      })
    ).toBeDefined();
  });
});

// -- For Failure Test -- //
/**
 * Constructor validation: each test builds a factory that constructs the
 * loader with invalid options and asserts that calling it throws the
 * expected error.
 */
describe("PolarisAIDataInsightLoader - Failure Initialization", () => {
  test("should throw error when both filePath and file are provided", () => {
    const docBytes = fs.readFileSync(EXAMPLE_DOC_PATH);
    const construct = () =>
      new PolarisAIDataInsightLoader({
        filePath: EXAMPLE_DOC_PATH,
        file: docBytes,
        filename: path.basename(EXAMPLE_DOC_PATH),
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      });

    expect(construct).toThrow("Both file_path and file/filename provided");
  });

  test("should throw error when only file is provided without filename", () => {
    const docBytes = fs.readFileSync(EXAMPLE_DOC_PATH);
    const construct = () =>
      new PolarisAIDataInsightLoader({
        file: docBytes,
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      });

    expect(construct).toThrow(
      "When using file data, both `file` and `filename` must be provided."
    );
  });

  test("should throw error when non-existent filePath is provided", () => {
    const construct = () =>
      new PolarisAIDataInsightLoader({
        filePath: EXAMPLE_NOT_EXIST_DOC_PATH,
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      });

    expect(construct).toThrow(/does not exist/);
  });

  test("should throw error when unsupported file type is provided (filePath)", () => {
    const construct = () =>
      new PolarisAIDataInsightLoader({
        filePath: EXAMPLE_UNSUPPORTED_DOC_PATH,
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      });

    expect(construct).toThrow(/Unsupported file extension/);
  });

  test("should throw error when unsupported file type is provided (file buffer)", () => {
    const unsupportedBytes = fs.readFileSync(EXAMPLE_UNSUPPORTED_DOC_PATH);
    const construct = () =>
      new PolarisAIDataInsightLoader({
        file: unsupportedBytes,
        filename: path.basename(EXAMPLE_UNSUPPORTED_DOC_PATH),
        apiKey: "api_key",
        resourcesDir: EXAMPLES_DIR,
      });

    expect(construct).toThrow(/Unsupported file extension/);
  });
});
Loading