Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/aspects.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,14 @@ WhiteSpaces
-----------------------

.. autoclass:: wildnlp.aspects.white_spaces.WhiteSpaces
   :members:
   :special-members: __init__
   :show-inheritance:

LowerCase
-----------------------

.. autoclass:: wildnlp.aspects.lowercase.LowerCase
   :members:
   :special-members: __init__
   :show-inheritance:
12 changes: 12 additions & 0 deletions tests/aspects/test_lowercase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from wildnlp.aspects import LowerCase


def test_single_word():
    """A single capitalised word is returned fully lower-cased."""
    aspect = LowerCase()
    assert aspect("Language") == "language"


def test_sentence():
    """A mixed-case sentence is lower-cased; spacing and punctuation stay."""
    original = "EU rejects German call to boycott British lamb."
    expected = "eu rejects german call to boycott british lamb."

    result = LowerCase()(original)

    assert result == expected
1 change: 1 addition & 0 deletions wildnlp/aspects/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
from .change_char import ChangeChar
from .white_spaces import WhiteSpaces
from .add_sub_string import AddSubString
from .lowercase import LowerCase
18 changes: 18 additions & 0 deletions wildnlp/aspects/lowercase.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from .base import Aspect


class LowerCase(Aspect):
    """Lower-cases the dataset.

    Each space-separated word of a sentence is converted to lower case;
    the original spacing, including runs of consecutive spaces, is kept.
    This aspect is especially important for Named Entity Recognition
    (NER) tasks.
    """

    def __call__(self, sentence):
        """Return ``sentence`` with every word lower-cased.

        :param sentence: text to transform; it is split on single spaces.

        :return: the lower-cased sentence, re-joined with single spaces.
        """
        tokens = sentence.split(' ')
        # Consecutive spaces produce empty tokens; pass them through as
        # empty strings so the spacing survives the re-join.
        lowered = (self._lowercase_word(token) if token != '' else ''
                   for token in tokens)
        return " ".join(lowered)

    @staticmethod
    def _lowercase_word(word):
        """Return the lower-cased form of a non-empty ``word``.

        :param word: the word to lower-case.

        :raises ValueError: if ``word`` is empty.

        :return: ``word`` converted to lower case.
        """
        if len(word) > 0:
            return word.lower()
        raise ValueError("Can't lowercase empty words")
14 changes: 10 additions & 4 deletions wildnlp/datasets/conll.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,17 @@ def load(self, path):
processed = self._process_sample(sample)
self._data.append(processed)

def apply(self, aspect, apply_to_ne=False):
def apply(self, aspect, apply_to_ne=False, apply_to_both=False):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it would be better to rewrite this so that there is a single apply_to parameter that can take one of three values:
NON_NE, NE, or BOTH? I think it's a little confusing when the behaviour is controlled by two separate variables.

"""

:param aspect: transformation function

:param apply_to_ne: if `False`, transformation won't be applied
to Named Entities. If `True`, transformation
will be applied only to Named Entities.
:param apply_to_both: if `True`, transformation will be applied
to both the Named Entities and other tokens.


:return: modified dataset in the following form:

Expand All @@ -82,10 +85,13 @@ def apply(self, aspect, apply_to_ne=False):
for entry in self._data:
tags = entry['ner_tags']

if apply_to_ne is False:
non_ner = np.where(tags == 'O')[0]
if not apply_to_both:
if apply_to_ne is False:
non_ner = np.where(tags == 'O')[0]
else:
non_ner = np.where(tags != 'O')[0]
else:
non_ner = np.where(tags != 'O')[0]
non_ner = range(len(entry['tokens']))

if len(non_ner) == 0:
modified.append(entry)
Expand Down