orc.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. """ orc compat """
  2. import distutils
  3. from typing import TYPE_CHECKING, List, Optional
  4. from pandas._typing import FilePathOrBuffer
  5. from pandas.io.common import get_filepath_or_buffer
  6. if TYPE_CHECKING:
  7. from pandas import DataFrame
  8. def read_orc(
  9. path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs,
  10. ) -> "DataFrame":
  11. """
  12. Load an ORC object from the file path, returning a DataFrame.
  13. .. versionadded:: 1.0.0
  14. Parameters
  15. ----------
  16. path : str, path object or file-like object
  17. Any valid string path is acceptable. The string could be a URL. Valid
  18. URL schemes include http, ftp, s3, and file. For file URLs, a host is
  19. expected. A local file could be:
  20. ``file://localhost/path/to/table.orc``.
  21. If you want to pass in a path object, pandas accepts any
  22. ``os.PathLike``.
  23. By file-like object, we refer to objects with a ``read()`` method,
  24. such as a file handler (e.g. via builtin ``open`` function)
  25. or ``StringIO``.
  26. columns : list, default None
  27. If not None, only these columns will be read from the file.
  28. **kwargs
  29. Any additional kwargs are passed to pyarrow.
  30. Returns
  31. -------
  32. DataFrame
  33. """
  34. # we require a newer version of pyarrow than we support for parquet
  35. import pyarrow
  36. if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0":
  37. raise ImportError("pyarrow must be >= 0.13.0 for read_orc")
  38. import pyarrow.orc
  39. path, _, _, _ = get_filepath_or_buffer(path)
  40. orc_file = pyarrow.orc.ORCFile(path)
  41. result = orc_file.read(columns=columns, **kwargs).to_pandas()
  42. return result