Skip to content

Commit

Permalink
Add string.contains APIs to pylibcudf (rapidsai#16814)
Browse files Browse the repository at this point in the history
Contributes to rapidsai#15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: rapidsai#16814
  • Loading branch information
mroeschke authored Sep 19, 2024
1 parent e9b5b53 commit 51c2dd6
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 69 deletions.
80 changes: 13 additions & 67 deletions python/cudf/cudf/_lib/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libc.stdint cimport uint32_t

from cudf.core.buffer import acquire_spill_lock

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.strings.contains cimport (
count_re as cpp_count_re,
like as cpp_like,
matches_re as cpp_matches_re,
)
from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
from pylibcudf.libcudf.strings.regex_program cimport regex_program

from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

from pylibcudf.strings import contains
from pylibcudf.strings.regex_program import RegexProgram
Expand All @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags):
Returns a Column with count of occurrences of `reg_ex` in
each string of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_count_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.count_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand All @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags):
Returns a Column with each value True if the string matches `reg_ex`
regular expression with each record of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef string reg_ex_string = <string>str(reg_ex).encode()
cdef regex_flags c_flags = <regex_flags>flags
cdef unique_ptr[regex_program] c_prog

with nogil:
c_prog = move(regex_program.create(reg_ex_string, c_flags))
c_result = move(cpp_matches_re(
source_view,
dereference(c_prog)
))

return Column.from_unique_ptr(move(c_result))
prog = RegexProgram.create(str(reg_ex), flags)
return Column.from_pylibcudf(
contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog)
)


@acquire_spill_lock()
Expand All @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape):
Returns a Column with each value True if the string matches the
`py_pattern` like expression with each record of `source_strings`
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

cdef DeviceScalar pattern = py_pattern.device_value
cdef DeviceScalar escape = py_escape.device_value

cdef const string_scalar* scalar_ptn = <const string_scalar*>(
pattern.get_raw_ptr()
)
cdef const string_scalar* scalar_esc = <const string_scalar*>(
escape.get_raw_ptr()
plc_column = contains.like(
source_strings.to_pylibcudf(mode="read"),
py_pattern.device_value.c_value,
py_escape.device_value.c_value,
)

with nogil:
c_result = move(cpp_like(
source_view,
scalar_ptn[0],
scalar_esc[0]
))

return Column.from_unique_ptr(move(c_result))
return Column.from_pylibcudf(plc_column)
7 changes: 6 additions & 1 deletion python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
cdef unique_ptr[column] like(
column_view source_strings,
string_scalar pattern,
string_scalar escape) except +
string_scalar escape_character) except +

cdef unique_ptr[column] like(
column_view source_strings,
column_view patterns,
string_scalar escape_character) except +
14 changes: 14 additions & 0 deletions python/pylibcudf/pylibcudf/strings/contains.pxd
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column
from pylibcudf.scalar cimport Scalar
from pylibcudf.strings.regex_program cimport RegexProgram

# Fused type: lets a single signature accept either a Column or a Scalar
# for the same parameter.
ctypedef fused ColumnOrScalar:
Column
Scalar

# Regex-based predicates/counters; each takes the input strings column and a
# compiled RegexProgram and returns a new Column.
cpdef Column contains_re(Column input, RegexProgram prog)

cpdef Column count_re(Column input, RegexProgram prog)

cpdef Column matches_re(Column input, RegexProgram prog)

# `pattern` may be a Column or a Scalar (see ColumnOrScalar above).
# `= *` declares `escape_character` as optional here; its actual default
# value is given in the implementation (.pyx) signature.
cpdef Column like(
Column input,
ColumnOrScalar pattern,
Scalar escape_character = *
)
130 changes: 129 additions & 1 deletion python/pylibcudf/pylibcudf/strings/contains.pyx
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from cython.operator import dereference

from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.scalar.scalar cimport string_scalar
from pylibcudf.libcudf.scalar.scalar_factories cimport (
make_string_scalar as cpp_make_string_scalar,
)
from pylibcudf.libcudf.strings cimport contains as cpp_contains
from pylibcudf.strings.regex_program cimport RegexProgram

Expand Down Expand Up @@ -32,9 +38,131 @@ cpdef Column contains_re(
cdef unique_ptr[column] result

with nogil:
result = cpp_contains.contains_re(
result = move(cpp_contains.contains_re(
input.view(),
prog.c_obj.get()[0]
))

return Column.from_libcudf(move(result))


cpdef Column count_re(Column input, RegexProgram prog):
    """Count how many times the regex program's pattern matches each string.

    For details, see :cpp:func:`cudf::strings::count_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of match counts for each string
    """
    cdef unique_ptr[column] c_counts

    # Run the libcudf call with the GIL released.
    with nogil:
        c_counts = move(
            cpp_contains.count_re(input.view(), dereference(prog.c_obj))
        )

    return Column.from_libcudf(move(c_counts))


cpdef Column matches_re(Column input, RegexProgram prog):
    """Return a boolean column marking the rows that match the regex program.

    A row only counts as a match when the pattern matches at the beginning
    of the string.

    For details, see :cpp:func:`cudf::strings::matches_re`.

    Parameters
    ----------
    input : Column
        The input strings
    prog : RegexProgram
        Regex program instance

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] c_matches

    # Run the libcudf call with the GIL released.
    with nogil:
        c_matches = move(
            cpp_contains.matches_re(input.view(), dereference(prog.c_obj))
        )

    return Column.from_libcudf(move(c_matches))


cpdef Column like(
    Column input,
    ColumnOrScalar pattern,
    Scalar escape_character=None
):
    """Return a boolean column identifying rows which match the like pattern.

    For details, see :cpp:func:`cudf::strings::like`.

    Parameters
    ----------
    input : Column
        The input strings
    pattern : Column or Scalar
        Like patterns to match within each string
    escape_character : Scalar, optional
        Character that specifies the escape prefix.
        Default is no escape character.

    Returns
    -------
    pylibcudf.Column
        New column of boolean results for each string
    """
    cdef unique_ptr[column] result

    if escape_character is None:
        # An empty string scalar stands in for "no escape character".
        escape_character = Scalar.from_libcudf(
            cpp_make_string_scalar("".encode())
        )

    # Raw pointers borrowed from the Scalar objects; the Scalars stay
    # referenced by this function's arguments for the duration of the call.
    cdef const string_scalar* c_escape_character = <const string_scalar*>(
        escape_character.c_obj.get()
    )
    cdef const string_scalar* c_pattern

    # ColumnOrScalar is a fused type over exactly Column and Scalar, so these
    # two branches are exhaustive; any other argument type is rejected when
    # the fused signature is dispatched, before this body runs.
    if ColumnOrScalar is Column:
        with nogil:
            result = move(cpp_contains.like(
                input.view(),
                pattern.view(),
                dereference(c_escape_character)
            ))
    elif ColumnOrScalar is Scalar:
        c_pattern = <const string_scalar*>(pattern.c_obj.get())
        with nogil:
            result = move(cpp_contains.like(
                input.view(),
                dereference(c_pattern),
                dereference(c_escape_character)
            ))

    return Column.from_libcudf(move(result))
37 changes: 37 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
pa_target_col, pa_target_scalar.as_py()
)
assert_column_eq(got, expected)


def test_count_re():
    """count_re agrees with pyarrow's count_substring_regex, nulls included."""
    regex = "[1-9][a-z]"
    pa_strings = pa.array(["A1a2A3a4", "A1A2A3", None])

    plc_strings = plc.interop.from_arrow(pa_strings)
    prog = plc.strings.regex_program.RegexProgram.create(
        regex, plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    got = plc.strings.contains.count_re(plc_strings, prog)

    want = pc.count_substring_regex(pa_strings, regex)
    assert_column_eq(got, want)


def test_match_re():
    """matches_re agrees with pyarrow's regex match anchored at the start."""
    regex = "[1-9][a-z]"
    pa_strings = pa.array(["1a2b", "b1a2", None])

    plc_strings = plc.interop.from_arrow(pa_strings)
    prog = plc.strings.regex_program.RegexProgram.create(
        regex, plc.strings.regex_flags.RegexFlags.DEFAULT
    )
    got = plc.strings.contains.matches_re(plc_strings, prog)

    # matches_re only matches at the beginning of each string, so the
    # pyarrow reference pattern is anchored with "^".
    want = pc.match_substring_regex(pa_strings, f"^{regex}")
    assert_column_eq(got, want)


def test_like():
    """like with a column of patterns agrees with pyarrow's match_like."""
    like_pattern = "%a"
    pa_strings = pa.array(["1a2aa3aaa"])

    plc_strings = plc.interop.from_arrow(pa_strings)
    plc_patterns = plc.interop.from_arrow(pa.array([like_pattern]))
    got = plc.strings.contains.like(plc_strings, plc_patterns)

    want = pc.match_like(pa_strings, like_pattern)
    assert_column_eq(got, want)

0 comments on commit 51c2dd6

Please sign in to comment.