How do I ignore duplicate fields? #409
-
|
Hi, Thanks for your continued work on this wonderful crate! |
Beta Was this translation helpful? Give feedback.
Replies: 1 comment
-
|
In the future, for something like this, it's really important to provide an MRE. Otherwise I have to spend my own time trying to guess at what it is you're asking about. I came up with this. Here is my `main.rs`:
#[derive(Debug, serde::Deserialize)]
/// Row type that `main` deserializes each CSV record into.
///
/// Note that the sample header in `main` lists `bar` twice.
struct Record {
/// Value of the `foo` column.
foo: i32,
/// Value of the `bar` column.
bar: i32,
/// Value of the `quux` column.
quux: i32,
/// Value of the `baz` column.
baz: i32,
}
fn main() -> anyhow::Result<()> {
let data = "\
foo,bar,quux,bar,baz
1,2,3,4,5
";
let mut rdr = csv::Reader::from_reader(data.as_bytes());
for result in rdr.deserialize() {
let record: Record = result?;
dbg!(&record);
}
Ok(())
}
With this `Cargo.toml`:
[package]
publish = false
name = "csv-duplicate-fields"
version = "0.1.0"
edition = "2024"
[dependencies]
anyhow = "1.0.100"
csv = "1.3.1"
serde = { version = "1.0.228", features = ["derive"] }
[[bin]]
name = "csv-duplicate-fields"
path = "main.rs"
[profile.release]
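debug = true
And the output: deserialization fails with a duplicate field error for `bar`. This error is not coming from the `csv` crate itself; it comes from the `Deserialize` implementation that `serde`'s derive macro generates. You can see where the error is coming from with the above example by running a macro-expansion tool such as `cargo expand`. That means that if you want to avoid that error, you'll need to write your own `Deserialize` implementation:
#[derive(Debug)]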
/// Row type for the example; its `Deserialize` impl is written by hand
/// below and returns a `duplicate_field` error when a key repeats.
struct Record {
/// Value stored for the `foo` key.
foo: i32,
/// Value stored for the `bar` key.
bar: i32,
/// Value stored for the `quux` key.
quux: i32,
/// Value stored for the `baz` key.
baz: i32,
}
impl<'de> serde::Deserialize<'de> for Record {
    /// Deserializes a `Record` from a map keyed by `foo`, `bar`, `quux` and
    /// `baz`.
    ///
    /// This is the strict variant: a key that shows up a second time is
    /// reported as a `duplicate_field` error, and a key that never shows up
    /// is reported as a `missing_field` error.
    #[inline]
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use serde::de;

        /// Names of the keys this struct accepts.
        const FIELDS: &[&str] = &["foo", "bar", "quux", "baz"];

        /// Identifies which struct field a map key refers to.
        enum Field {
            Foo,
            Bar,
            Quux,
            Baz,
        }

        impl<'de> serde::Deserialize<'de> for Field {
            fn deserialize<D>(deserializer: D) -> Result<Field, D::Error>
            where
                D: serde::Deserializer<'de>,
            {
                struct FieldVisitor;

                impl<'de> de::Visitor<'de> for FieldVisitor {
                    type Value = Field;

                    fn expecting(
                        &self,
                        f: &mut std::fmt::Formatter,
                    ) -> std::fmt::Result {
                        f.write_str("`foo`, `bar`, `quux` or `baz`")
                    }

                    // String keys take the same path as byte keys.
                    fn visit_str<E>(self, value: &str) -> Result<Field, E>
                    where
                        E: de::Error,
                    {
                        self.visit_bytes(value.as_bytes())
                    }

                    fn visit_bytes<E>(self, value: &[u8]) -> Result<Field, E>
                    where
                        E: de::Error,
                    {
                        let name = match str::from_utf8(value) {
                            Ok(name) => name,
                            Err(_) => {
                                return Err(de::Error::invalid_value(
                                    de::Unexpected::Bytes(value),
                                    &"valid UTF-8",
                                ));
                            }
                        };
                        match name {
                            "foo" => Ok(Field::Foo),
                            "bar" => Ok(Field::Bar),
                            "quux" => Ok(Field::Quux),
                            "baz" => Ok(Field::Baz),
                            other => {
                                Err(de::Error::unknown_field(other, FIELDS))
                            }
                        }
                    }
                }

                deserializer.deserialize_identifier(FieldVisitor)
            }
        }

        struct Visitor;

        impl<'de> de::Visitor<'de> for Visitor {
            type Value = Record;

            fn expecting(
                &self,
                formatter: &mut std::fmt::Formatter,
            ) -> std::fmt::Result {
                formatter.write_str(
                    "a map with `foo`, `bar`, `quux` and `baz` keys",
                )
            }

            fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error>
            where
                V: de::MapAccess<'de>,
            {
                let mut foo = None;
                let mut bar = None;
                let mut quux = None;
                let mut baz = None;

                while let Some(key) = map.next_key()? {
                    // The guarded arms fire when a key repeats; they bail out
                    // before the repeated value is read.
                    match key {
                        Field::Foo if foo.is_some() => {
                            return Err(de::Error::duplicate_field("foo"));
                        }
                        Field::Foo => foo = Some(map.next_value()?),
                        Field::Bar if bar.is_some() => {
                            return Err(de::Error::duplicate_field("bar"));
                        }
                        Field::Bar => bar = Some(map.next_value()?),
                        Field::Quux if quux.is_some() => {
                            return Err(de::Error::duplicate_field("quux"));
                        }
                        Field::Quux => quux = Some(map.next_value()?),
                        Field::Baz if baz.is_some() => {
                            return Err(de::Error::duplicate_field("baz"));
                        }
                        Field::Baz => baz = Some(map.next_value()?),
                    }
                }

                // Missing keys are reported in declaration order.
                let Some(foo) = foo else {
                    return Err(de::Error::missing_field("foo"));
                };
                let Some(bar) = bar else {
                    return Err(de::Error::missing_field("bar"));
                };
                let Some(quux) = quux else {
                    return Err(de::Error::missing_field("quux"));
                };
                let Some(baz) = baz else {
                    return Err(de::Error::missing_field("baz"));
                };
                Ok(Record { foo, bar, quux, baz })
            }
        }

        deserializer.deserialize_struct("Record", FIELDS, Visitor)
    }
}
fn main() -> anyhow::Result<()> {
let data = "\
foo,bar,quux,bar,baz
1,2,3,4,5
";
let mut rdr = csv::Reader::from_reader(data.as_bytes());
for result in rdr.deserialize() {
let record: Record = result?;
dbg!(&record);
}
Ok(())
}
If you look at this closely, you'll see exactly where the duplicate field error is occurring. Now you can implement whatever kind of semantics you want. You said you want to ignore the columns, but you don't say if you want a "first one wins" or a "last one wins" strategy. I'll pick the former and implement that:
#[derive(Debug)]
/// Row type for the example; its hand-written `Deserialize` impl below
/// keeps the first value seen for each key.
struct Record {
/// First value seen for the `foo` key.
foo: i32,
/// First value seen for the `bar` key.
bar: i32,
/// First value seen for the `quux` key.
quux: i32,
/// First value seen for the `baz` key.
baz: i32,
}
impl<'de> serde::Deserialize<'de> for Record {
    /// Deserializes a `Record` from a map keyed by `foo`, `bar`, `quux` and
    /// `baz`, using a "first one wins" policy for repeated keys.
    ///
    /// The first value seen for a key is parsed and stored. Later values
    /// for the same key are consumed as `serde::de::IgnoredAny`, i.e.
    /// skipped without being parsed, so the contents of an ignored
    /// duplicate can never make the record fail.
    ///
    /// # Errors
    ///
    /// Returns `unknown_field` for a key outside the four above,
    /// `missing_field` when a key never appears, and whatever error the
    /// deserializer reports when a *kept* value is not a valid `i32`.
    #[inline]
    fn deserialize<D: serde::Deserializer<'de>>(
        deserializer: D,
    ) -> Result<Self, D::Error> {
        use serde::de;

        /// Names of the keys this struct accepts.
        const FIELDS: &[&str] = &["foo", "bar", "quux", "baz"];

        /// Identifies which struct field a map key refers to.
        enum Field {
            Foo,
            Bar,
            Quux,
            Baz,
        }

        impl<'de> serde::Deserialize<'de> for Field {
            fn deserialize<D>(deserializer: D) -> Result<Field, D::Error>
            where
                D: serde::Deserializer<'de>,
            {
                struct FieldVisitor;

                impl<'de> serde::de::Visitor<'de> for FieldVisitor {
                    type Value = Field;

                    fn expecting(
                        &self,
                        f: &mut std::fmt::Formatter,
                    ) -> std::fmt::Result {
                        f.write_str("`foo`, `bar`, `quux` or `baz`")
                    }

                    // String keys take the same path as byte keys.
                    fn visit_str<E>(self, value: &str) -> Result<Field, E>
                    where
                        E: serde::de::Error,
                    {
                        self.visit_bytes(value.as_bytes())
                    }

                    fn visit_bytes<E>(self, value: &[u8]) -> Result<Field, E>
                    where
                        E: serde::de::Error,
                    {
                        let value = str::from_utf8(value).map_err(|_| {
                            de::Error::invalid_value(
                                de::Unexpected::Bytes(value),
                                &"valid UTF-8",
                            )
                        })?;
                        match value {
                            "foo" => Ok(Field::Foo),
                            "bar" => Ok(Field::Bar),
                            "quux" => Ok(Field::Quux),
                            "baz" => Ok(Field::Baz),
                            _ => Err(de::Error::unknown_field(value, FIELDS)),
                        }
                    }
                }

                deserializer.deserialize_identifier(FieldVisitor)
            }
        }

        /// Consumes the value for the current key, storing it in `slot`
        /// only if `slot` is still empty.
        fn keep_first<'de, A, T>(
            map: &mut A,
            slot: &mut Option<T>,
        ) -> Result<(), A::Error>
        where
            A: de::MapAccess<'de>,
            T: de::Deserialize<'de>,
        {
            if slot.is_none() {
                *slot = Some(map.next_value()?);
            } else {
                // The value still has to be consumed to keep the map access
                // in sync, but it is being discarded, so skip it without
                // parsing. A malformed duplicate must not fail the record.
                let _: de::IgnoredAny = map.next_value()?;
            }
            Ok(())
        }

        struct Visitor;

        impl<'de> serde::de::Visitor<'de> for Visitor {
            type Value = Record;

            fn expecting(
                &self,
                formatter: &mut std::fmt::Formatter,
            ) -> std::fmt::Result {
                formatter.write_str(
                    "a map with `foo`, `bar`, `quux` and `baz` keys",
                )
            }

            fn visit_map<V>(self, mut map: V) -> Result<Self::Value, V::Error>
            where
                V: serde::de::MapAccess<'de>,
            {
                let mut foo: Option<i32> = None;
                let mut bar: Option<i32> = None;
                let mut quux: Option<i32> = None;
                let mut baz: Option<i32> = None;

                while let Some(key) = map.next_key()? {
                    match key {
                        Field::Foo => keep_first(&mut map, &mut foo)?,
                        Field::Bar => keep_first(&mut map, &mut bar)?,
                        Field::Quux => keep_first(&mut map, &mut quux)?,
                        Field::Baz => keep_first(&mut map, &mut baz)?,
                    }
                }

                // Missing keys are reported in declaration order.
                let foo =
                    foo.ok_or_else(|| de::Error::missing_field("foo"))?;
                let bar =
                    bar.ok_or_else(|| de::Error::missing_field("bar"))?;
                let quux =
                    quux.ok_or_else(|| de::Error::missing_field("quux"))?;
                let baz =
                    baz.ok_or_else(|| de::Error::missing_field("baz"))?;
                Ok(Record { foo, bar, quux, baz })
            }
        }

        deserializer.deserialize_struct("Record", FIELDS, Visitor)
    }
}
fn main() -> anyhow::Result<()> {
let data = "\
foo,bar,quux,bar,baz
1,2,3,4,5
";
let mut rdr = csv::Reader::from_reader(data.as_bytes());
for result in rdr.deserialize() {
let record: Record = result?;
dbg!(&record);
}
Ok(())
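}
And the output: the record is deserialized with `bar: 2`. Since the first `bar` column wins, the value from the second `bar` column (`4`) is dropped. With that said, writing your own `Deserialize` implementation is a lot of code; an alternative is to strip the duplicate columns from each record before deserializing:
use std::collections::HashSet;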
use csv::StringRecord;
/// Row type for the example. `main` deserializes it through
/// `DuplicateRemover::deserialize`, which passes on only the fields that
/// are not at a duplicate header index.
#[derive(Debug, serde::Deserialize)]
struct Record {
/// Value of the `foo` column.
foo: i32,
/// Value of the `bar` column.
bar: i32,
/// Value of the `quux` column.
quux: i32,
/// Value of the `baz` column.
baz: i32,
}
/// Reads the sample CSV (whose header repeats `bar`), removes the duplicate
/// columns from every record and prints each resulting `Record` with `dbg!`.
fn main() -> anyhow::Result<()> {
    let data = "\
foo,bar,quux,bar,baz
1,2,3,4,5
";
    let mut reader = csv::Reader::from_reader(data.as_bytes());

    // The remover is built from the header row once, before any record is
    // read, and then reused for every record.
    let original_headers = reader.headers()?;
    let duplicate_remover =
        DuplicateRemover::from_original_headers(original_headers);

    for raw in reader.records() {
        let record = duplicate_remover.deserialize::<Record>(&raw?)?;
        dbg!(&record);
    }
    Ok(())
}
/// Removes duplicate columns from records, keeping the first column seen
/// for each header name.
#[derive(Debug, Default)]
struct DuplicateRemover {
/// The original header row with every repeated name removed (first
/// occurrence kept).
header_without_duplicates: StringRecord,
/// Positions in the original header row that hold a repeated name; fields
/// at these positions are dropped before deserializing.
duplicate_indices: HashSet<usize>,
}
impl DuplicateRemover {
/// Builds a remover from the original header row.
///
/// The first occurrence of each header name is copied into
/// `header_without_duplicates`. The index of every later occurrence is
/// recorded in `duplicate_indices`.
fn from_original_headers(original: &StringRecord) -> DuplicateRemover {
    let mut result = DuplicateRemover::default();
    let mut names_seen = HashSet::new();
    for (column, name) in original.iter().enumerate() {
        // `insert` returns false when the name was already present.
        if !names_seen.insert(name) {
            result.duplicate_indices.insert(column);
            continue;
        }
        result.header_without_duplicates.push_field(name);
    }
    result
}
fn deserialize<D>(&self, original: &StringRecord) -> csv::Result<D>
where
D: serde::de::DeserializeOwned,
{
let without_duplicates: StringRecord = original
.iter()
.enumerate()
.filter(|(i, _)| !self.duplicate_indices.contains(&i))
.map(|(_, field)| field)
.collect();
without_duplicates.deserialize(Some(&self.header_without_duplicates))
}
}
And the output: the record is again deserialized with `bar: 2` (the first `bar` column).
I'll leave "last one wins" (or other semantics) as an exercise to the reader. :-) |
Beta Was this translation helpful? Give feedback.
In the future, for something like this, it's really important to provide an MRE. Otherwise I have to spend my own time trying to guess at what it is you're asking about. I came up with this. Here is my `main.rs`:
With this `Cargo.toml`: