From bb626bab8113377d09210a4e3cf7cd7351182cdf Mon Sep 17 00:00:00 2001 From: lestcape Date: Mon, 17 May 2021 18:27:09 -0500 Subject: [PATCH 1/5] Writer: Do not hardcode the charset and also pass it to the buffer. --- src/Sav/Writer.php | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Sav/Writer.php b/src/Sav/Writer.php index 36260d8..e1ef808 100644 --- a/src/Sav/Writer.php +++ b/src/Sav/Writer.php @@ -97,7 +97,10 @@ public function write($data) $this->info[Record\Info\VariableAttributes::SUBTYPE] = new Record\Info\VariableAttributes(); $this->info[Record\Info\LongStringValueLabels::SUBTYPE] = new Record\Info\LongStringValueLabels(); $this->info[Record\Info\LongStringMissingValues::SUBTYPE] = new Record\Info\LongStringMissingValues(); - $this->info[Record\Info\CharacterEncoding::SUBTYPE] = new Record\Info\CharacterEncoding('UTF-8'); + + $encode = (isset($data['info']) && isset($data['info']['characterEncoding'])) ? $config['info']['characterEncoding'] : 'UTF-8'; + $this->info[Record\Info\CharacterEncoding::SUBTYPE] = new Record\Info\CharacterEncoding($encode); + $this->buffer->charset = $encode; $this->data = new Record\Data(); From 7c5d843faea6550316b259effcbec51770dc20cd Mon Sep 17 00:00:00 2001 From: lestcape Date: Mon, 17 May 2021 18:36:18 -0500 Subject: [PATCH 2/5] Reader: Refactor the code in a convenient way to add the charset support. 
--- src/Sav/Reader.php | 60 ++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/Sav/Reader.php b/src/Sav/Reader.php index 1537ca1..e76e94f 100644 --- a/src/Sav/Reader.php +++ b/src/Sav/Reader.php @@ -71,6 +71,34 @@ private function __construct(Buffer $buffer) $this->_buffer->context = $this; } + private function readBodyInternal() + { + $infoCollection = new Record\InfoCollection(); + $posVar = 0; + do { + $recType = $this->_buffer->readInt(); + switch ($recType) { + case Record\Variable::TYPE: + $variable = Record\Variable::fill($this->_buffer); + $variable->realPosition = $posVar; + $this->variables[] = $variable; + $posVar++; + break; + case Record\ValueLabel::TYPE: + $this->valueLabels[] = Record\ValueLabel::fill($this->_buffer, [ + 'variables' => $this->variables, + ]); + break; + case Record\Info::TYPE: + $this->info = $infoCollection->fill($this->_buffer); + break; + case Record\Document::TYPE: + $this->documents = Record\Document::fill($this->_buffer)->toArray(); + break; + } + } while (Record\Data::TYPE !== $recType); + } + /** * @param string $file * @@ -127,33 +155,7 @@ public function readBody() } // TODO: refactory - $infoCollection = new Record\InfoCollection(); - $tempVars = []; - $posVar = 0; - - do { - $recType = $this->_buffer->readInt(); - switch ($recType) { - case Record\Variable::TYPE: - $variable = Record\Variable::fill($this->_buffer); - $variable->realPosition = $posVar; - $tempVars[] = $variable; - $posVar++; - break; - case Record\ValueLabel::TYPE: - $this->valueLabels[] = Record\ValueLabel::fill($this->_buffer, [ - // TODO: refactory - 'variables' => $tempVars, - ]); - break; - case Record\Info::TYPE: - $this->info = $infoCollection->fill($this->_buffer); - break; - case Record\Document::TYPE: - $this->documents = Record\Document::fill($this->_buffer)->toArray(); - break; - } - } while (Record\Data::TYPE !== $recType); + $this->readBodyInternal(); // Excluding the records that 
are creating only as a consequence of very long string records // from the variables computation. @@ -163,9 +165,11 @@ public function readBody() } $segmentsCount = 0; + $tempVars = $this->variables; + $this->variables = []; foreach ($tempVars as $index => $var) { // Skip blank records from the variables computation - if (-1 !== $var->width) { + if ($var->width !== -1) { if ($segmentsCount <= 0) { $segmentsCount = Utils::widthToSegments( isset($veryLongStrings[$var->name]) ? From 0019a7f3daeaed397ce6b45206cf1c674c7d24dc Mon Sep 17 00:00:00 2001 From: lestcape Date: Mon, 17 May 2021 18:38:15 -0500 Subject: [PATCH 3/5] Reader: Add support for charset. --- src/Sav/Reader.php | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Sav/Reader.php b/src/Sav/Reader.php index e76e94f..1286d49 100644 --- a/src/Sav/Reader.php +++ b/src/Sav/Reader.php @@ -154,9 +154,28 @@ public function readBody() $this->readHeader(); } - // TODO: refactory + // TODO: We need to find a better way to decode the body, because the CharacterEncoding + // data is not necessarily set at the beginning of the body and any string that is set + // before it is then not decoded. So, we need to read the body twice, once to find the + // encoding and once more to decode it. + $headerPosition = $this->_buffer->position(); $this->readBodyInternal(); + if (isset($this->info) && isset($this->info[Record\Info\CharacterEncoding::SUBTYPE])) { + $encode = $this->info[Record\Info\CharacterEncoding::SUBTYPE]->value; + // If is not set assume the UTF-8 encode. + $encode = (isset($encode) && !empty($encode)) ? $encode : "UTF-8"; + $this->_buffer->charset = $encode; + + if ($this->_buffer->seek($headerPosition) === 0) { + $this->valueLabels = []; + $this->info = []; + $this->documents = []; + $this->variables = []; + $this->readBodyInternal(); + } + } + // Excluding the records that are creating only as a consequence of very long string records // from the variables computation. 
$veryLongStrings = []; From d20497a25193ae86541a774d1e6fb9443ef63206 Mon Sep 17 00:00:00 2001 From: lestcape Date: Mon, 17 May 2021 19:53:31 -0500 Subject: [PATCH 4/5] Do not translate from the same encode to the same and fix the copy paste error. --- src/Sav/Writer.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Sav/Writer.php b/src/Sav/Writer.php index e1ef808..aff88bc 100644 --- a/src/Sav/Writer.php +++ b/src/Sav/Writer.php @@ -98,7 +98,7 @@ public function write($data) $this->info[Record\Info\LongStringValueLabels::SUBTYPE] = new Record\Info\LongStringValueLabels(); $this->info[Record\Info\LongStringMissingValues::SUBTYPE] = new Record\Info\LongStringMissingValues(); - $encode = (isset($data['info']) && isset($data['info']['characterEncoding'])) ? $config['info']['characterEncoding'] : 'UTF-8'; + $encode = (isset($data['info']) && isset($data['info']['characterEncoding'])) ? $data['info']['characterEncoding'] : 'UTF-8'; $this->info[Record\Info\CharacterEncoding::SUBTYPE] = new Record\Info\CharacterEncoding($encode); $this->buffer->charset = $encode; From df94c229db8780ea7d2908b81e23aa15c44a2386 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lester=20Carballo=20P=C3=A9rez?= Date: Mon, 12 Feb 2024 15:24:30 -0600 Subject: [PATCH 5/5] Resolve conflict --- src/Sav/Reader.php | 2 +- src/Sav/Writer.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Sav/Reader.php b/src/Sav/Reader.php index 1286d49..2f3fe5e 100644 --- a/src/Sav/Reader.php +++ b/src/Sav/Reader.php @@ -165,7 +165,7 @@ public function readBody() $encode = $this->info[Record\Info\CharacterEncoding::SUBTYPE]->value; // If is not set assume the UTF-8 encode. $encode = (isset($encode) && !empty($encode)) ? 
$encode : "UTF-8"; - $this->_buffer->charset = $encode; + $this->_buffer->streamCharset = $encode; if ($this->_buffer->seek($headerPosition) === 0) { $this->valueLabels = []; diff --git a/src/Sav/Writer.php b/src/Sav/Writer.php index aff88bc..149f796 100644 --- a/src/Sav/Writer.php +++ b/src/Sav/Writer.php @@ -100,7 +100,7 @@ public function write($data) $encode = (isset($data['info']) && isset($data['info']['characterEncoding'])) ? $data['info']['characterEncoding'] : 'UTF-8'; $this->info[Record\Info\CharacterEncoding::SUBTYPE] = new Record\Info\CharacterEncoding($encode); - $this->buffer->charset = $encode; + $this->buffer->streamCharset = $encode; $this->data = new Record\Data();