parsing docx->html in the browser

Pick a file using a file input, read its contents using the FileReader browser API, and convert it to HTML using the mammoth library. Then extract the tables from the HTML and parse them into JSON using the HtmlTableToJson parser from the html-table-to-json library.

November 16, 2024
  // Change handler for the file input: loads the picked file into memory as an
  // ArrayBuffer and hands it to setFile once the read completes.
  const handleFileChange = (e: React.ChangeEvent<HTMLInputElement>) => {
    const pickedFile = e.target.files?.[0];
    if (!pickedFile) {
      // Nothing was selected, so there is nothing to read.
      return;
    }

    // FileReader reads the contents of a Blob or File asynchronously.
    // https://developer.mozilla.org/en-US/docs/Web/API/FileReader
    const fileReader = new FileReader();

    // Fired if the read is aborted; the handler takes no arguments here.
    fileReader.onabort = () => {
      console.log("File reading aborted");
    };

    // Fired when the read fails; the event object is logged as-is.
    fileReader.onerror = (errorEvent) => {
      console.error("Error reading file:", errorEvent);
    };

    // Fired when the read succeeds. The event's target exposes the read
    // result; it is stored only when it is actually an ArrayBuffer.
    fileReader.onload = (loadEvent) => {
      const contents = loadEvent.target?.result;
      if (!(contents instanceof ArrayBuffer)) return;
      setFile(contents);
    };

    // Kick off the read now that every handler is attached.
    fileReader.readAsArrayBuffer(pickedFile);
  };

  // Renders a wrapper <div> containing a single file input; choosing a file
  // invokes handleFileChange.
  return (
    <div className="flex h-full w-full flex-col items-center justify-center">
      <input
        onChange={handleFileChange}
        type="file"
        // limits the files offered in the file picker to .docx
        accept=".docx"
        // tailwind + daisyui classes for file inputs
        className="file-input w-full max-w-xs"
      />
</div>)
// parse the docx using mammoth and convert to json 
import { convertToHtml } from "mammoth";
// The library below does not ship its own types.
// Learn how to write a TypeScript module declaration for it:
//  https://tigawanna-portfolio.vercel.app/lessons/zvkw94okeaegslk
import HtmlTableToJson from "html-table-to-json";



/**
 * Converts the contents of a .docx file to HTML and parses the HTML's tables
 * into JSON.
 *
 * @param file - The raw file contents, or `null` when no file has been loaded.
 * @returns An object with the generated `html` and the parsed `tableJson`, or
 *   `undefined` when `file` is `null` or the conversion throws. Both failure
 *   cases are reported with `console.error` rather than rethrown.
 */
export async function handleConvert(file: ArrayBuffer | null) {
  // Guard clause: without file contents there is nothing to convert.
  if (!file) {
    console.error("File not found");
    return;
  }

  try {
    // mammoth is called with an empty options object.
    const { value: html } = await convertToHtml({ arrayBuffer: file }, {});
    const parsedTables = HtmlTableToJson.parse(html);

    // NOTE(review): `_results` looks like an internal field of the parser;
    // confirm against the library / local module declaration.
    return { html, tableJson: parsedTables._results };
  } catch (error) {
    console.error('Error converting file:', error);
    return;
  }
}