Skip to content

Fails to handle newlines and tabs #35

@SamuelLarkin

Description

@SamuelLarkin

Given an input sentence that contains newlines or tabs, spacy_conll outputs invalid CoNLL.

The motivation for using a global shared memory to help coordination and the idea of using the transformer to implement it are clear.\n2.\t

#!/usr/bin/env  python3
"""
uv venv --relocatable --python=3.13 --prompt=conll.issue venv.gh_issue
source venv.gh_issue/bin/activate
uv pip install spacy spacy-conll pyconll
uv pip install en_core_web_lg@https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl
"""
import pyconll
import spacy
from pyconll.unit.conll import Conll

from spacy_conll import ConllFormatter

# Minimal reproduction: run a sentence containing a newline and a tab through
# spaCy with the conll_formatter component, print the CoNLL string of each
# sentence, and try to parse that string back with pyconll.

# NOTE(review): this instance is never referenced again in the script; the
# component actually used is the one created by nlp.add_pipe below. Confirm
# whether constructing it here is needed for the reproduction.
formatter = ConllFormatter(include_headers=True)
spacy_model = "en_core_web_lg"

# The literal ends with "\n2.\t": a newline after "clear.", then "2.", then a
# trailing tab. These two whitespace characters are what the issue is about.
sentence = "The motivation for using a global shared memory to help coordination and the idea of using the transformer to implement it are clear.\n2.\t"

# Load the model with the "ner" component disabled.
nlp = spacy.load(spacy_model, disable=["ner"])
# Register the CoNLL formatter as the last pipeline component, with
# include_headers turned on in its config.
nlp.add_pipe(
    "conll_formatter",
    last=True,
    config={"include_headers": True},
)
docs = nlp.pipe([sentence])
for doc in docs:
    for sent in doc.sents:
        # CoNLL representation of this sentence, exposed by the formatter as a
        # custom extension attribute on the sentence span.
        conll_str = sent._.conll_str
        print(conll_str)

        # Round-trip check: in the reported run this call raises
        # pyconll.exception.ParseError because the emitted string is not
        # valid CoNLL. The parsed result itself is not used afterwards.
        parsed: Conll = pyconll.load.load_from_string(conll_str)

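This produces the following output. Note that word #25's output spans multiple lines and that word #28 has too many columns. The `# text` header is also split across two lines (the stray `2.` line), which is the first thing the parser rejects below.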

# sent_id = 1
# text = The motivation for using a global shared memory to help coordination and the idea of using the transformer to implement it are clear.
2.
1       The     the     DET     DT      Definite=Def|PronType=Art       2       det     _       _
2       motivation      motivation      NOUN    NN      Number=Sing     22      nsubj   _       _
3       for     for     ADP     IN      _       2       prep    _       _
4       using   use     VERB    VBG     Aspect=Prog|Tense=Pres|VerbForm=Part    3       pcomp   _       _
5       a       a       DET     DT      Definite=Ind|PronType=Art       8       det     _       _
6       global  global  ADJ     JJ      Degree=Pos      8       amod    _       _
7       shared  shared  ADJ     JJ      Degree=Pos      8       amod    _       _
8       memory  memory  NOUN    NN      Number=Sing     4       dobj    _       _
9       to      to      PART    TO      _       10      aux     _       _
10      help    help    VERB    VB      VerbForm=Inf    4       xcomp   _       _
11      coordination    coordination    NOUN    NN      Number=Sing     10      dobj    _       _
12      and     and     CCONJ   CC      ConjType=Cmp    11      cc      _       _
13      the     the     DET     DT      Definite=Def|PronType=Art       14      det     _       _
14      idea    idea    NOUN    NN      Number=Sing     11      conj    _       _
15      of      of      ADP     IN      _       14      prep    _       _
16      using   use     VERB    VBG     Aspect=Prog|Tense=Pres|VerbForm=Part    15      pcomp   _       _
17      the     the     DET     DT      Definite=Def|PronType=Art       18      det     _       _
18      transformer     transformer     NOUN    NN      Number=Sing     16      dobj    _       _
19      to      to      PART    TO      _       20      aux     _       _
20      implement       implement       VERB    VB      VerbForm=Inf    16      xcomp   _       _
21      it      it      PRON    PRP     Case=Acc|Gender=Neut|Number=Sing|Person=3|PronType=Prs  20      dobj    _       _
22      are     be      AUX     VBP     Mood=Ind|Tense=Pres|VerbForm=Fin        0       ROOT    _       _
23      clear   clear   ADJ     JJ      Degree=Pos      22      acomp   _       SpaceAfter=No
24      .       .       PUNCT   .       PunctType=Peri  22      punct   _       SpaceAfter=No
25

        SPACE   _SP     _       24      dep     _       SpaceAfter=No
26      2       2       X       LS      NumType=Ord     22      attr    _       SpaceAfter=No
27      .       .       PUNCT   .       PunctType=Peri  22      punct   _       SpaceAfter=No
28                                      SPACE   _SP     _       27      dep     _       SpaceAfter=No

Parsing this output with pyconll fails with the following error messages:

Traceback (most recent call last):
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/unit/sentence.py", line 86, in __init__
    token = Token(line)
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/unit/token.py", line 666, in __init__
    raise ParseError(error_msg)
pyconll.exception.ParseError: The number of columns per token line must be 10. Invalid token: 2.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/_parser.py", line 30, in _create_sentence
    sentence = Sentence(sent_source)
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/unit/sentence.py", line 88, in __init__
    raise ParseError(
        f'Error creating token on line {i} for the current sentence'
    ) from err
pyconll.exception.ParseError: Error creating token on line 2 for the current sentence

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/scripts/gh.issue.py", line 31, in <module>
    parsed: Conll = pyconll.load.load_from_string(conll_str)
                    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/load.py", line 32, in load_from_string
    c = Conll(lines)
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/unit/conll.py", line 34, in __init__
    for sentence in pyconll._parser.iter_sentences(it):
                    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/_parser.py", line 64, in iter_sentences
    sentence = _create_sentence(sent_lines, last_empty_line + 2)
  File "/gpfs/fs5/nrc/nrc-fs1/ict/portage/corpora/OpenReview/venv.gh_issue/lib/python3.13/site-packages/pyconll/_parser.py", line 32, in _create_sentence
    raise ParseError(
        f'Failed to create sentence at line {line_num}') from err
pyconll.exception.ParseError: Failed to create sentence at line 1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions