# sas7bdat.py — SAS7BDAT file reader (extraction artifacts removed)
  1. """
  2. Read SAS7BDAT files
  3. Based on code written by Jared Hobbs:
  4. https://bitbucket.org/jaredhobbs/sas7bdat
  5. See also:
  6. https://github.com/BioStatMatt/sas7bdat
  7. Partial documentation of the file format:
  8. https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
  9. Reference for binary data compression:
  10. http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
  11. """
from collections import abc
from datetime import datetime
import struct

import numpy as np

import pandas as pd
from pandas.errors import EmptyDataError
from pandas.io.common import get_filepath_or_buffer
from pandas.io.sas._sas import Parser
import pandas.io.sas.sas_constants as const
  21. class _subheader_pointer:
  22. pass
  23. class _column:
  24. pass
  25. # SAS7BDAT represents a SAS data file in SAS7BDAT format.
  26. class SAS7BDATReader(abc.Iterator):
  27. """
  28. Read SAS files in SAS7BDAT format.
  29. Parameters
  30. ----------
  31. path_or_buf : path name or buffer
  32. Name of SAS file or file-like object pointing to SAS file
  33. contents.
  34. index : column identifier, defaults to None
  35. Column to use as index.
  36. convert_dates : boolean, defaults to True
  37. Attempt to convert dates to Pandas datetime values. Note that
  38. some rarely used SAS date formats may be unsupported.
  39. blank_missing : boolean, defaults to True
  40. Convert empty strings to missing values (SAS uses blanks to
  41. indicate missing character variables).
  42. chunksize : int, defaults to None
  43. Return SAS7BDATReader object for iterations, returns chunks
  44. with given number of lines.
  45. encoding : string, defaults to None
  46. String encoding.
  47. convert_text : bool, defaults to True
  48. If False, text variables are left as raw bytes.
  49. convert_header_text : bool, defaults to True
  50. If False, header text, including column names, are left as raw
  51. bytes.
  52. """
  53. def __init__(
  54. self,
  55. path_or_buf,
  56. index=None,
  57. convert_dates=True,
  58. blank_missing=True,
  59. chunksize=None,
  60. encoding=None,
  61. convert_text=True,
  62. convert_header_text=True,
  63. ):
  64. self.index = index
  65. self.convert_dates = convert_dates
  66. self.blank_missing = blank_missing
  67. self.chunksize = chunksize
  68. self.encoding = encoding
  69. self.convert_text = convert_text
  70. self.convert_header_text = convert_header_text
  71. self.default_encoding = "latin-1"
  72. self.compression = ""
  73. self.column_names_strings = []
  74. self.column_names = []
  75. self.column_formats = []
  76. self.columns = []
  77. self._current_page_data_subheader_pointers = []
  78. self._cached_page = None
  79. self._column_data_lengths = []
  80. self._column_data_offsets = []
  81. self._column_types = []
  82. self._current_row_in_file_index = 0
  83. self._current_row_on_page_index = 0
  84. self._current_row_in_file_index = 0
  85. self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
  86. if isinstance(self._path_or_buf, str):
  87. self._path_or_buf = open(self._path_or_buf, "rb")
  88. self.handle = self._path_or_buf
  89. self._get_properties()
  90. self._parse_metadata()
  91. def column_data_lengths(self):
  92. """Return a numpy int64 array of the column data lengths"""
  93. return np.asarray(self._column_data_lengths, dtype=np.int64)
  94. def column_data_offsets(self):
  95. """Return a numpy int64 array of the column offsets"""
  96. return np.asarray(self._column_data_offsets, dtype=np.int64)
  97. def column_types(self):
  98. """Returns a numpy character array of the column types:
  99. s (string) or d (double)"""
  100. return np.asarray(self._column_types, dtype=np.dtype("S1"))
  101. def close(self):
  102. try:
  103. self.handle.close()
  104. except AttributeError:
  105. pass
  106. def _get_properties(self):
  107. # Check magic number
  108. self._path_or_buf.seek(0)
  109. self._cached_page = self._path_or_buf.read(288)
  110. if self._cached_page[0 : len(const.magic)] != const.magic:
  111. self.close()
  112. raise ValueError("magic number mismatch (not a SAS file?)")
  113. # Get alignment information
  114. align1, align2 = 0, 0
  115. buf = self._read_bytes(const.align_1_offset, const.align_1_length)
  116. if buf == const.u64_byte_checker_value:
  117. align2 = const.align_2_value
  118. self.U64 = True
  119. self._int_length = 8
  120. self._page_bit_offset = const.page_bit_offset_x64
  121. self._subheader_pointer_length = const.subheader_pointer_length_x64
  122. else:
  123. self.U64 = False
  124. self._page_bit_offset = const.page_bit_offset_x86
  125. self._subheader_pointer_length = const.subheader_pointer_length_x86
  126. self._int_length = 4
  127. buf = self._read_bytes(const.align_2_offset, const.align_2_length)
  128. if buf == const.align_1_checker_value:
  129. align1 = const.align_2_value
  130. total_align = align1 + align2
  131. # Get endianness information
  132. buf = self._read_bytes(const.endianness_offset, const.endianness_length)
  133. if buf == b"\x01":
  134. self.byte_order = "<"
  135. else:
  136. self.byte_order = ">"
  137. # Get encoding information
  138. buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
  139. if buf in const.encoding_names:
  140. self.file_encoding = const.encoding_names[buf]
  141. else:
  142. self.file_encoding = f"unknown (code={buf})"
  143. # Get platform information
  144. buf = self._read_bytes(const.platform_offset, const.platform_length)
  145. if buf == b"1":
  146. self.platform = "unix"
  147. elif buf == b"2":
  148. self.platform = "windows"
  149. else:
  150. self.platform = "unknown"
  151. buf = self._read_bytes(const.dataset_offset, const.dataset_length)
  152. self.name = buf.rstrip(b"\x00 ")
  153. if self.convert_header_text:
  154. self.name = self.name.decode(self.encoding or self.default_encoding)
  155. buf = self._read_bytes(const.file_type_offset, const.file_type_length)
  156. self.file_type = buf.rstrip(b"\x00 ")
  157. if self.convert_header_text:
  158. self.file_type = self.file_type.decode(
  159. self.encoding or self.default_encoding
  160. )
  161. # Timestamp is epoch 01/01/1960
  162. epoch = datetime(1960, 1, 1)
  163. x = self._read_float(
  164. const.date_created_offset + align1, const.date_created_length
  165. )
  166. self.date_created = epoch + pd.to_timedelta(x, unit="s")
  167. x = self._read_float(
  168. const.date_modified_offset + align1, const.date_modified_length
  169. )
  170. self.date_modified = epoch + pd.to_timedelta(x, unit="s")
  171. self.header_length = self._read_int(
  172. const.header_size_offset + align1, const.header_size_length
  173. )
  174. # Read the rest of the header into cached_page.
  175. buf = self._path_or_buf.read(self.header_length - 288)
  176. self._cached_page += buf
  177. if len(self._cached_page) != self.header_length:
  178. self.close()
  179. raise ValueError("The SAS7BDAT file appears to be truncated.")
  180. self._page_length = self._read_int(
  181. const.page_size_offset + align1, const.page_size_length
  182. )
  183. self._page_count = self._read_int(
  184. const.page_count_offset + align1, const.page_count_length
  185. )
  186. buf = self._read_bytes(
  187. const.sas_release_offset + total_align, const.sas_release_length
  188. )
  189. self.sas_release = buf.rstrip(b"\x00 ")
  190. if self.convert_header_text:
  191. self.sas_release = self.sas_release.decode(
  192. self.encoding or self.default_encoding
  193. )
  194. buf = self._read_bytes(
  195. const.sas_server_type_offset + total_align, const.sas_server_type_length
  196. )
  197. self.server_type = buf.rstrip(b"\x00 ")
  198. if self.convert_header_text:
  199. self.server_type = self.server_type.decode(
  200. self.encoding or self.default_encoding
  201. )
  202. buf = self._read_bytes(
  203. const.os_version_number_offset + total_align, const.os_version_number_length
  204. )
  205. self.os_version = buf.rstrip(b"\x00 ")
  206. if self.convert_header_text:
  207. self.os_version = self.os_version.decode(
  208. self.encoding or self.default_encoding
  209. )
  210. buf = self._read_bytes(const.os_name_offset + total_align, const.os_name_length)
  211. buf = buf.rstrip(b"\x00 ")
  212. if len(buf) > 0:
  213. self.os_name = buf.decode(self.encoding or self.default_encoding)
  214. else:
  215. buf = self._read_bytes(
  216. const.os_maker_offset + total_align, const.os_maker_length
  217. )
  218. self.os_name = buf.rstrip(b"\x00 ")
  219. if self.convert_header_text:
  220. self.os_name = self.os_name.decode(
  221. self.encoding or self.default_encoding
  222. )
  223. def __next__(self):
  224. da = self.read(nrows=self.chunksize or 1)
  225. if da is None:
  226. raise StopIteration
  227. return da
  228. # Read a single float of the given width (4 or 8).
  229. def _read_float(self, offset, width):
  230. if width not in (4, 8):
  231. self.close()
  232. raise ValueError("invalid float width")
  233. buf = self._read_bytes(offset, width)
  234. fd = "f" if width == 4 else "d"
  235. return struct.unpack(self.byte_order + fd, buf)[0]
  236. # Read a single signed integer of the given width (1, 2, 4 or 8).
  237. def _read_int(self, offset, width):
  238. if width not in (1, 2, 4, 8):
  239. self.close()
  240. raise ValueError("invalid int width")
  241. buf = self._read_bytes(offset, width)
  242. it = {1: "b", 2: "h", 4: "l", 8: "q"}[width]
  243. iv = struct.unpack(self.byte_order + it, buf)[0]
  244. return iv
  245. def _read_bytes(self, offset, length):
  246. if self._cached_page is None:
  247. self._path_or_buf.seek(offset)
  248. buf = self._path_or_buf.read(length)
  249. if len(buf) < length:
  250. self.close()
  251. msg = f"Unable to read {length:d} bytes from file position {offset:d}."
  252. raise ValueError(msg)
  253. return buf
  254. else:
  255. if offset + length > len(self._cached_page):
  256. self.close()
  257. raise ValueError("The cached page is too small.")
  258. return self._cached_page[offset : offset + length]
  259. def _parse_metadata(self):
  260. done = False
  261. while not done:
  262. self._cached_page = self._path_or_buf.read(self._page_length)
  263. if len(self._cached_page) <= 0:
  264. break
  265. if len(self._cached_page) != self._page_length:
  266. self.close()
  267. raise ValueError("Failed to read a meta data page from the SAS file.")
  268. done = self._process_page_meta()
  269. def _process_page_meta(self):
  270. self._read_page_header()
  271. pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
  272. if self._current_page_type in pt:
  273. self._process_page_metadata()
  274. is_data_page = self._current_page_type & const.page_data_type
  275. is_mix_page = self._current_page_type in const.page_mix_types
  276. return (
  277. is_data_page
  278. or is_mix_page
  279. or self._current_page_data_subheader_pointers != []
  280. )
  281. def _read_page_header(self):
  282. bit_offset = self._page_bit_offset
  283. tx = const.page_type_offset + bit_offset
  284. self._current_page_type = self._read_int(tx, const.page_type_length)
  285. tx = const.block_count_offset + bit_offset
  286. self._current_page_block_count = self._read_int(tx, const.block_count_length)
  287. tx = const.subheader_count_offset + bit_offset
  288. self._current_page_subheaders_count = self._read_int(
  289. tx, const.subheader_count_length
  290. )
  291. def _process_page_metadata(self):
  292. bit_offset = self._page_bit_offset
  293. for i in range(self._current_page_subheaders_count):
  294. pointer = self._process_subheader_pointers(
  295. const.subheader_pointers_offset + bit_offset, i
  296. )
  297. if pointer.length == 0:
  298. continue
  299. if pointer.compression == const.truncated_subheader_id:
  300. continue
  301. subheader_signature = self._read_subheader_signature(pointer.offset)
  302. subheader_index = self._get_subheader_index(
  303. subheader_signature, pointer.compression, pointer.ptype
  304. )
  305. self._process_subheader(subheader_index, pointer)
  306. def _get_subheader_index(self, signature, compression, ptype):
  307. index = const.subheader_signature_to_index.get(signature)
  308. if index is None:
  309. f1 = (compression == const.compressed_subheader_id) or (compression == 0)
  310. f2 = ptype == const.compressed_subheader_type
  311. if (self.compression != "") and f1 and f2:
  312. index = const.SASIndex.data_subheader_index
  313. else:
  314. self.close()
  315. raise ValueError("Unknown subheader signature")
  316. return index
  317. def _process_subheader_pointers(self, offset, subheader_pointer_index):
  318. subheader_pointer_length = self._subheader_pointer_length
  319. total_offset = offset + subheader_pointer_length * subheader_pointer_index
  320. subheader_offset = self._read_int(total_offset, self._int_length)
  321. total_offset += self._int_length
  322. subheader_length = self._read_int(total_offset, self._int_length)
  323. total_offset += self._int_length
  324. subheader_compression = self._read_int(total_offset, 1)
  325. total_offset += 1
  326. subheader_type = self._read_int(total_offset, 1)
  327. x = _subheader_pointer()
  328. x.offset = subheader_offset
  329. x.length = subheader_length
  330. x.compression = subheader_compression
  331. x.ptype = subheader_type
  332. return x
  333. def _read_subheader_signature(self, offset):
  334. subheader_signature = self._read_bytes(offset, self._int_length)
  335. return subheader_signature
  336. def _process_subheader(self, subheader_index, pointer):
  337. offset = pointer.offset
  338. length = pointer.length
  339. if subheader_index == const.SASIndex.row_size_index:
  340. processor = self._process_rowsize_subheader
  341. elif subheader_index == const.SASIndex.column_size_index:
  342. processor = self._process_columnsize_subheader
  343. elif subheader_index == const.SASIndex.column_text_index:
  344. processor = self._process_columntext_subheader
  345. elif subheader_index == const.SASIndex.column_name_index:
  346. processor = self._process_columnname_subheader
  347. elif subheader_index == const.SASIndex.column_attributes_index:
  348. processor = self._process_columnattributes_subheader
  349. elif subheader_index == const.SASIndex.format_and_label_index:
  350. processor = self._process_format_subheader
  351. elif subheader_index == const.SASIndex.column_list_index:
  352. processor = self._process_columnlist_subheader
  353. elif subheader_index == const.SASIndex.subheader_counts_index:
  354. processor = self._process_subheader_counts
  355. elif subheader_index == const.SASIndex.data_subheader_index:
  356. self._current_page_data_subheader_pointers.append(pointer)
  357. return
  358. else:
  359. raise ValueError("unknown subheader index")
  360. processor(offset, length)
  361. def _process_rowsize_subheader(self, offset, length):
  362. int_len = self._int_length
  363. lcs_offset = offset
  364. lcp_offset = offset
  365. if self.U64:
  366. lcs_offset += 682
  367. lcp_offset += 706
  368. else:
  369. lcs_offset += 354
  370. lcp_offset += 378
  371. self.row_length = self._read_int(
  372. offset + const.row_length_offset_multiplier * int_len, int_len
  373. )
  374. self.row_count = self._read_int(
  375. offset + const.row_count_offset_multiplier * int_len, int_len
  376. )
  377. self.col_count_p1 = self._read_int(
  378. offset + const.col_count_p1_multiplier * int_len, int_len
  379. )
  380. self.col_count_p2 = self._read_int(
  381. offset + const.col_count_p2_multiplier * int_len, int_len
  382. )
  383. mx = const.row_count_on_mix_page_offset_multiplier * int_len
  384. self._mix_page_row_count = self._read_int(offset + mx, int_len)
  385. self._lcs = self._read_int(lcs_offset, 2)
  386. self._lcp = self._read_int(lcp_offset, 2)
  387. def _process_columnsize_subheader(self, offset, length):
  388. int_len = self._int_length
  389. offset += int_len
  390. self.column_count = self._read_int(offset, int_len)
  391. if self.col_count_p1 + self.col_count_p2 != self.column_count:
  392. print(
  393. f"Warning: column count mismatch ({self.col_count_p1} + "
  394. f"{self.col_count_p2} != "
  395. f"{self.column_count})\n"
  396. )
  397. # Unknown purpose
  398. def _process_subheader_counts(self, offset, length):
  399. pass
  400. def _process_columntext_subheader(self, offset, length):
  401. offset += self._int_length
  402. text_block_size = self._read_int(offset, const.text_block_size_length)
  403. buf = self._read_bytes(offset, text_block_size)
  404. cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
  405. cname = cname_raw
  406. if self.convert_header_text:
  407. cname = cname.decode(self.encoding or self.default_encoding)
  408. self.column_names_strings.append(cname)
  409. if len(self.column_names_strings) == 1:
  410. compression_literal = ""
  411. for cl in const.compression_literals:
  412. if cl in cname_raw:
  413. compression_literal = cl
  414. self.compression = compression_literal
  415. offset -= self._int_length
  416. offset1 = offset + 16
  417. if self.U64:
  418. offset1 += 4
  419. buf = self._read_bytes(offset1, self._lcp)
  420. compression_literal = buf.rstrip(b"\x00")
  421. if compression_literal == "":
  422. self._lcs = 0
  423. offset1 = offset + 32
  424. if self.U64:
  425. offset1 += 4
  426. buf = self._read_bytes(offset1, self._lcp)
  427. self.creator_proc = buf[0 : self._lcp]
  428. elif compression_literal == const.rle_compression:
  429. offset1 = offset + 40
  430. if self.U64:
  431. offset1 += 4
  432. buf = self._read_bytes(offset1, self._lcp)
  433. self.creator_proc = buf[0 : self._lcp]
  434. elif self._lcs > 0:
  435. self._lcp = 0
  436. offset1 = offset + 16
  437. if self.U64:
  438. offset1 += 4
  439. buf = self._read_bytes(offset1, self._lcs)
  440. self.creator_proc = buf[0 : self._lcp]
  441. if self.convert_header_text:
  442. if hasattr(self, "creator_proc"):
  443. self.creator_proc = self.creator_proc.decode(
  444. self.encoding or self.default_encoding
  445. )
  446. def _process_columnname_subheader(self, offset, length):
  447. int_len = self._int_length
  448. offset += int_len
  449. column_name_pointers_count = (length - 2 * int_len - 12) // 8
  450. for i in range(column_name_pointers_count):
  451. text_subheader = (
  452. offset
  453. + const.column_name_pointer_length * (i + 1)
  454. + const.column_name_text_subheader_offset
  455. )
  456. col_name_offset = (
  457. offset
  458. + const.column_name_pointer_length * (i + 1)
  459. + const.column_name_offset_offset
  460. )
  461. col_name_length = (
  462. offset
  463. + const.column_name_pointer_length * (i + 1)
  464. + const.column_name_length_offset
  465. )
  466. idx = self._read_int(
  467. text_subheader, const.column_name_text_subheader_length
  468. )
  469. col_offset = self._read_int(
  470. col_name_offset, const.column_name_offset_length
  471. )
  472. col_len = self._read_int(col_name_length, const.column_name_length_length)
  473. name_str = self.column_names_strings[idx]
  474. self.column_names.append(name_str[col_offset : col_offset + col_len])
  475. def _process_columnattributes_subheader(self, offset, length):
  476. int_len = self._int_length
  477. column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
  478. for i in range(column_attributes_vectors_count):
  479. col_data_offset = (
  480. offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
  481. )
  482. col_data_len = (
  483. offset
  484. + 2 * int_len
  485. + const.column_data_length_offset
  486. + i * (int_len + 8)
  487. )
  488. col_types = (
  489. offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
  490. )
  491. x = self._read_int(col_data_offset, int_len)
  492. self._column_data_offsets.append(x)
  493. x = self._read_int(col_data_len, const.column_data_length_length)
  494. self._column_data_lengths.append(x)
  495. x = self._read_int(col_types, const.column_type_length)
  496. self._column_types.append(b"d" if x == 1 else b"s")
  497. def _process_columnlist_subheader(self, offset, length):
  498. # unknown purpose
  499. pass
  500. def _process_format_subheader(self, offset, length):
  501. int_len = self._int_length
  502. text_subheader_format = (
  503. offset + const.column_format_text_subheader_index_offset + 3 * int_len
  504. )
  505. col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
  506. col_format_len = offset + const.column_format_length_offset + 3 * int_len
  507. text_subheader_label = (
  508. offset + const.column_label_text_subheader_index_offset + 3 * int_len
  509. )
  510. col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
  511. col_label_len = offset + const.column_label_length_offset + 3 * int_len
  512. x = self._read_int(
  513. text_subheader_format, const.column_format_text_subheader_index_length
  514. )
  515. format_idx = min(x, len(self.column_names_strings) - 1)
  516. format_start = self._read_int(
  517. col_format_offset, const.column_format_offset_length
  518. )
  519. format_len = self._read_int(col_format_len, const.column_format_length_length)
  520. label_idx = self._read_int(
  521. text_subheader_label, const.column_label_text_subheader_index_length
  522. )
  523. label_idx = min(label_idx, len(self.column_names_strings) - 1)
  524. label_start = self._read_int(col_label_offset, const.column_label_offset_length)
  525. label_len = self._read_int(col_label_len, const.column_label_length_length)
  526. label_names = self.column_names_strings[label_idx]
  527. column_label = label_names[label_start : label_start + label_len]
  528. format_names = self.column_names_strings[format_idx]
  529. column_format = format_names[format_start : format_start + format_len]
  530. current_column_number = len(self.columns)
  531. col = _column()
  532. col.col_id = current_column_number
  533. col.name = self.column_names[current_column_number]
  534. col.label = column_label
  535. col.format = column_format
  536. col.ctype = self._column_types[current_column_number]
  537. col.length = self._column_data_lengths[current_column_number]
  538. self.column_formats.append(column_format)
  539. self.columns.append(col)
  540. def read(self, nrows=None):
  541. if (nrows is None) and (self.chunksize is not None):
  542. nrows = self.chunksize
  543. elif nrows is None:
  544. nrows = self.row_count
  545. if len(self._column_types) == 0:
  546. self.close()
  547. raise EmptyDataError("No columns to parse from file")
  548. if self._current_row_in_file_index >= self.row_count:
  549. return None
  550. m = self.row_count - self._current_row_in_file_index
  551. if nrows > m:
  552. nrows = m
  553. nd = self._column_types.count(b"d")
  554. ns = self._column_types.count(b"s")
  555. self._string_chunk = np.empty((ns, nrows), dtype=np.object)
  556. self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
  557. self._current_row_in_chunk_index = 0
  558. p = Parser(self)
  559. p.read(nrows)
  560. rslt = self._chunk_to_dataframe()
  561. if self.index is not None:
  562. rslt = rslt.set_index(self.index)
  563. return rslt
  564. def _read_next_page(self):
  565. self._current_page_data_subheader_pointers = []
  566. self._cached_page = self._path_or_buf.read(self._page_length)
  567. if len(self._cached_page) <= 0:
  568. return True
  569. elif len(self._cached_page) != self._page_length:
  570. self.close()
  571. msg = (
  572. "failed to read complete page from file (read "
  573. f"{len(self._cached_page):d} of "
  574. f"{self._page_length:d} bytes)"
  575. )
  576. raise ValueError(msg)
  577. self._read_page_header()
  578. page_type = self._current_page_type
  579. if page_type == const.page_meta_type:
  580. self._process_page_metadata()
  581. is_data_page = page_type & const.page_data_type
  582. pt = [const.page_meta_type] + const.page_mix_types
  583. if not is_data_page and self._current_page_type not in pt:
  584. return self._read_next_page()
  585. return False
  586. def _chunk_to_dataframe(self):
  587. n = self._current_row_in_chunk_index
  588. m = self._current_row_in_file_index
  589. ix = range(m - n, m)
  590. rslt = pd.DataFrame(index=ix)
  591. js, jb = 0, 0
  592. for j in range(self.column_count):
  593. name = self.column_names[j]
  594. if self._column_types[j] == b"d":
  595. rslt[name] = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
  596. rslt[name] = np.asarray(rslt[name], dtype=np.float64)
  597. if self.convert_dates:
  598. unit = None
  599. if self.column_formats[j] in const.sas_date_formats:
  600. unit = "d"
  601. elif self.column_formats[j] in const.sas_datetime_formats:
  602. unit = "s"
  603. if unit:
  604. rslt[name] = pd.to_datetime(
  605. rslt[name], unit=unit, origin="1960-01-01"
  606. )
  607. jb += 1
  608. elif self._column_types[j] == b"s":
  609. rslt[name] = self._string_chunk[js, :]
  610. if self.convert_text and (self.encoding is not None):
  611. rslt[name] = rslt[name].str.decode(
  612. self.encoding or self.default_encoding
  613. )
  614. if self.blank_missing:
  615. ii = rslt[name].str.len() == 0
  616. rslt.loc[ii, name] = np.nan
  617. js += 1
  618. else:
  619. self.close()
  620. raise ValueError(f"unknown column type {self._column_types[j]}")
  621. return rslt