-
Notifications
You must be signed in to change notification settings - Fork 341
Fix filesystem #2291
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Fix filesystem #2291
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -381,21 +381,38 @@ def to_input_file(self) -> PyArrowFile: | |
|
||
class PyArrowFileIO(FileIO): | ||
fs_by_scheme: Callable[[str, Optional[str]], FileSystem] | ||
config: Config | ||
|
||
def __init__(self, properties: Properties = EMPTY_DICT): | ||
self.fs_by_scheme: Callable[[str, Optional[str]], FileSystem] = lru_cache(self._initialize_fs) | ||
self.config = Config() | ||
super().__init__(properties=properties) | ||
|
||
@staticmethod | ||
def parse_location(location: str) -> Tuple[str, str, str]: | ||
"""Return the path without the scheme.""" | ||
def parse_location(location: str, config: Config) -> Tuple[str, str, str]: | ||
"""Return (scheme, netloc, path) for the given location. | ||
|
||
Uses environment variables DEFAULT_SCHEME and DEFAULT_NETLOC | ||
if scheme/netloc are missing. | ||
""" | ||
uri = urlparse(location) | ||
if not uri.scheme: | ||
return "file", uri.netloc, os.path.abspath(location) | ||
elif uri.scheme in ("hdfs", "viewfs"): | ||
return uri.scheme, uri.netloc, uri.path | ||
|
||
# Load defaults from environment | ||
default_scheme = config.get_str("default-scheme") or "file" | ||
default_netloc = config.get_str("default-netloc") or "" | ||
|
||
# Apply logic | ||
scheme = uri.scheme or default_scheme | ||
netloc = uri.netloc or default_netloc | ||
|
||
if scheme in ("hdfs", "viewfs"): | ||
return scheme, netloc, uri.path | ||
else: | ||
return uri.scheme, uri.netloc, f"{uri.netloc}{uri.path}" | ||
# For non-HDFS URIs, include netloc in the path if present | ||
path = uri.path if uri.scheme else os.path.abspath(location) | ||
if netloc and not path.startswith(netloc): | ||
path = f"{netloc}{path}" | ||
return scheme, netloc, path | ||
Comment on lines
+404
to
+415
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually really want to get rid of this if {scheme} logic here. Is there a way to refactor these changes down to the […]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't see a nice way to do this, since the path used in the PyArrowFile is actually different in the different cases. I tried to see if we could use the same path with the netloc in it for HDFS, but it doesn't seem to work.
||
|
||
def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSystem: | ||
"""Initialize FileSystem for different scheme.""" | ||
|
@@ -584,7 +601,7 @@ def new_input(self, location: str) -> PyArrowFile: | |
Returns: | ||
PyArrowFile: A PyArrowFile instance for the given location. | ||
""" | ||
scheme, netloc, path = self.parse_location(location) | ||
scheme, netloc, path = self.parse_location(location, self.config) | ||
return PyArrowFile( | ||
fs=self.fs_by_scheme(scheme, netloc), | ||
location=location, | ||
|
@@ -601,7 +618,7 @@ def new_output(self, location: str) -> PyArrowFile: | |
Returns: | ||
PyArrowFile: A PyArrowFile instance for the given location. | ||
""" | ||
scheme, netloc, path = self.parse_location(location) | ||
scheme, netloc, path = self.parse_location(location, self.config) | ||
return PyArrowFile( | ||
fs=self.fs_by_scheme(scheme, netloc), | ||
location=location, | ||
|
@@ -622,7 +639,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None: | |
an AWS error code 15. | ||
""" | ||
str_location = location.location if isinstance(location, (InputFile, OutputFile)) else location | ||
scheme, netloc, path = self.parse_location(str_location) | ||
scheme, netloc, path = self.parse_location(str_location, self.config) | ||
fs = self.fs_by_scheme(scheme, netloc) | ||
|
||
try: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I think it's better to pass these in through the properties field:
https://py.iceberg.apache.org/configuration/#hdfs
We can get the env variable and then pass it into the properties.