+ '_> {
let key = PartialKey::new(inode.id(), Some(ObjectType::DirIndex), None);
let children = self.fs_root.find_range(&key)?;
let a = children.map(|(_, v)| v.try_into_dir_index().expect("dir index"));
Ok(a)
}
By looking at the file names of the `DirIndex` entries we can also look up files by their absolute path, or by a path relative to some known inode:
pub fn get_inode_by_relative_path<P>(&self, inode: INode, path: P) -> Result<INode>
where
P: Path,
{
if path.is_absolute() {
self.get_inode_by_path(path)
} else {
self.get_inode_by_relative_normalized_path(inode, path.normalize())
}
}
pub fn get_inode_by_path<P>(&self, path: P) -> Result<INode>
where
P: Path,
{
let mut normalized = path.normalize();
if !path.is_absolute() {
log::error!("path is not absolute!");
} else {
// pop root
_ = normalized.pop_segment();
}
self.get_inode_by_relative_normalized_path(self.get_root_dir(), normalized)
}
pub fn get_inode_by_relative_normalized_path(
&self,
inode: INode,
path: NormalizedPath,
) -> Result<INode> {
let mut inode = inode;
for segment in path.iter() {
match segment {
crate::path::Segment::ParentDir => {
inode = self.get_inode_parent(&inode)?;
}
crate::path::Segment::File(child_name) => {
let child = self
.get_inode_children_inodes(&inode)?
.find(|child| {
child.path.last().map(|bytes| bytes.as_slice()) == Some(child_name)
})
.ok_or(Error::INodeNotFound)?
.clone();
// silly borrow checker
inode = child;
}
_ => unreachable!(),
}
}
Ok(inode)
}
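For illustration, resolving a file could then look something like this. Note that `fs`, the `path!` macro and the variable names are stand-ins I made up for this sketch, not the crate's actual API:

```rust
// Hypothetical usage; `fs` is a mounted filesystem handle and `path!` is an
// assumed helper that builds the crate's path type.
let etc = fs.get_inode_by_path(path!("/etc"))?;
let hostname = fs.get_inode_by_relative_path(etc, path!("hostname"))?;
// `..` segments walk back up via the parent lookup shown above.
let passwd = fs.get_inode_by_relative_path(hostname, path!("../passwd"))?;
```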
To be able to actually read a file we need to get a list of its `ExtentData` records:
pub fn get_inode_extents(&self, inode_id: u64) -> Result<Vec<(u64, ExtentData)>> {
if let Some(dir_entry) = self.get_inode_dir_index(inode_id)? {
if dir_entry.item().ty() == DirItemType::RegFile {
let key = PartialKey::new(inode_id.into(), Some(ObjectType::ExtentData), None);
let extents = self.fs_root.find_range(&key)?;
let extents = extents
.map(|(key, item)| {
(
key.key.offset.get(),
item.as_extent_data().expect("extent data").clone(),
)
})
.collect::<Vec<_>>();
Ok(extents)
} else {
Ok(vec![])
}
} else {
Err(Error::INodeNotFound)
}
}
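Since `find_range` walks the tree in key order, the extents should come back sorted by their byte offset into the file, which the read path below relies on. For illustration (`fs` and `inode` being the same hypothetical stand-ins as in the path example above):

```rust
// Hypothetical usage: list the extents backing a file.
for (file_offset, extent) in fs.get_inode_extents(inode.id)? {
    log::info!("extent at file offset {}: {} bytes", file_offset, extent.len());
}
```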
`ExtentData` records come in two flavors: `Inline`, where the file contents follow immediately after the record on disk, and `Regular`, where the record is instead followed by additional fields pointing at the extent that holds the file contents.
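A rough model of the two flavors, just to make the shape of the data concrete. The field names here are illustrative; the real records carry more metadata such as sizes, generation and the compression type:

```rust
// Illustrative only: roughly the information the two flavors carry.
enum ExtentDataSketch<'a> {
    // Small files: the bytes live directly behind the record in the leaf.
    Inline { data: &'a [u8] },
    // Larger files: the record points at an extent somewhere else on disk,
    // plus an offset and length describing which part of it belongs here.
    Regular { address: u64, offset: u64, len: u64 },
}
```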
To read some view into an inode, we need to check each extent for an overlap with our view's bounds:
pub fn read_inode_raw<I: core::ops::RangeBounds<u64>>(&self, inode: &INode, range: I) -> Result<Vec<u8>> {
let mut contents = Vec::new();
let extents = self.get_inode_extents(inode.id)?;
let start = match range.start_bound() {
core::ops::Bound::Included(v) => *v,
core::ops::Bound::Excluded(v) => *v + 1,
core::ops::Bound::Unbounded => 0,
};
let end = match range.end_bound() {
core::ops::Bound::Included(v) => Some(*v + 1),
core::ops::Bound::Excluded(v) => Some(*v),
core::ops::Bound::Unbounded => None,
};
log::info!("extents: {}", extents.len());
log::info!("{:?}", extents);
for (offset, extent) in extents.into_iter().filter(|(offset, extent)| {
// bounds of the current extent
let extent_start = *offset;
let extent_end = extent_start + extent.len();
let extent_len = extent.len();
// entire range we want to read from the file
let range_len = end.map(|end| end - start);
// start of the UNION (from lowest bound to highest bound) of the
// current extent and the entire range
let union_start = start.min(extent_start);
// end of the UNION of the current extent and the entire range
let union_end = end.map(|end| end.max(extent_end));
// width of the union of the current extent and the entire range
let union_len = union_end.map(|end| end - union_start);
if let (Some(union_len), Some(range_len)) = (union_len, range_len) {
// proceed if the widths of the 2 ranges (the range we want to
// read, and the current extent) are greater than the width of
// the union range:
//
// In the first example, the 2 ranges overlap, and the width of
// the union is smaller than the sum of the widths of the ranges:
//
// |------range-1------|
// |---range-2----|
// |-----width-of-union-----|
// |-------sum----|-of---widths-------|
// |------------width-of-union------------|
// |------range-1------|
// |---range-2----|
//
// In this second example, the ranges do not overlap, and the
// width of the union is equal to or greater than the sum of the
// widths.
union_len < extent_len + range_len
} else {
// with no upper bound, the extent overlaps the read range iff it
// ends after the requested start
start < extent_end
}
}) {
// translate the requested range into offsets relative to this extent
let start = start.saturating_sub(offset);
let end = end.map(|end| end - offset).unwrap_or(start + extent.len());
let len = end - start;
log::info!("reading {}..{:?} from extent.", start, end);
let data: alloc::borrow::Cow<[u8]> = match &extent {
ExtentData::Inline { data, .. } => (&data[start as usize..end as usize]).into(),
ExtentData::Other(extent) => {
let address = extent.address() + extent.offset() + start;
let address = self
.volume
.inner
.offset_from_logical(address)
.ok_or(Error::BadLogicalAddress)?;
let range = match extent.extent_data1().compression() {
// compressed size
CompressionType::Zlib | CompressionType::Lzo | CompressionType::ZStd => {
address..address + extent.size()
}
_ => address + start..address + start + len,
};
let data = self.volume.inner.read_range(range).expect("bytes");
data.into()
}
};
// truncate inflated data if needed
contents.extend_from_slice(&data[..len as usize]);
}
Ok(contents)
}
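The filter above is really just a range overlap test: two half-open ranges overlap exactly when the width of their union is smaller than the sum of their individual widths. As a standalone sketch:

```rust
// Standalone version of the overlap test used in the filter above.
fn ranges_overlap(a: core::ops::Range<u64>, b: core::ops::Range<u64>) -> bool {
    let union_width = a.end.max(b.end) - a.start.min(b.start);
    let sum_of_widths = (a.end - a.start) + (b.end - b.start);
    union_width < sum_of_widths
}

// ranges_overlap(0..10, 5..15)  == true   (overlapping)
// ranges_overlap(0..10, 10..20) == false  (adjacent, but not overlapping)
```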
Btrfs also supports compression at individual extent resolution with the Zlib, Zstd and Lzo algorithms. To be able to read a compressed file we first need to decompress the entire extent and only truncate it to the requested view afterwards:
log::info!("reading {} bytes from file", data.len());
log::info!("compression: {:?}", extent.header().compression());
match extent.header().compression() {
CompressionType::None => {
contents.extend_from_slice(&data);
}
CompressionType::Zlib => {
let mut state = miniz_oxide::inflate::stream::InflateState::new(
miniz_oxide::DataFormat::Zlib,
);
let mut output_data = vec![0u8; extent.header().decoded_size() as usize];
let mut output = &mut output_data[..];
let mut data = &data[..];
loop {
let result = miniz_oxide::inflate::stream::inflate(
&mut state,
&data,
&mut output,
miniz_oxide::MZFlush::None,
);
match result.status.map_err(|_| Error::DecompressionError)? {
miniz_oxide::MZStatus::Ok => {}
miniz_oxide::MZStatus::StreamEnd => break,
_ => {
log::error!("need dict ?!");
return Err(Error::DecompressionError);
}
}
data = &data[result.bytes_consumed..];
output = &mut output[result.bytes_written..];
}
_ = miniz_oxide::inflate::stream::inflate(
&mut state,
&data,
&mut output,
miniz_oxide::MZFlush::Finish,
)
.status
.map_err(|_| Error::DecompressionError)?;
// truncate inflated data if needed
contents
.extend_from_slice(&output_data[start as usize..(start + len) as usize]);
}
CompressionType::Lzo => {
todo!()
}
CompressionType::ZStd => {
let mut output_data = vec![0u8; extent.header().decoded_size() as usize];
let mut zstd = zstd_safe::DCtx::create();
zstd.init().map_err(|e| {
log::error!("zstd init error: {}", zstd_safe::get_error_name(e));
Error::DecompressionError
})?;
let mut input = zstd_safe::InBuffer::around(&data);
let mut output = zstd_safe::OutBuffer::around(&mut output_data[..]);
loop {
match zstd.decompress_stream(&mut output, &mut input) {
Ok(len) => {
if len == 0 {
break;
}
}
Err(e) => {
log::error!(
"zstd decompress stream error: {}",
zstd_safe::get_error_name(e)
);
return Err(Error::DecompressionError);
}
}
if output.pos() == extent.header().decoded_size() as usize {
break;
}
}
contents
.extend_from_slice(&output_data[start as usize..(start + len) as usize]);
}
c => {
log::error!("invalid compression type {:?}", c);
contents.extend_from_slice(&data);
}
}
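For zlib specifically, the streaming loop above could also be swapped for miniz_oxide's one-shot helper, since the whole decoded extent is held in memory here anyway (the output buffer is sized from `decoded_size()`). A minimal sketch, using the same `data`, `start` and `len` as the surrounding code:

```rust
// One-shot zlib decompression of the whole extent, then truncation to the
// part of it that the requested view actually covers.
let inflated = miniz_oxide::inflate::decompress_to_vec_zlib(&data)
    .map_err(|_| Error::DecompressionError)?;
contents.extend_from_slice(&inflated[start as usize..(start + len) as usize]);
```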
## Design Flaws
Everything, of course; this is just a toy meant to help me understand how Btrfs works and how it is laid out :^).
But specifically:
### Copying huge chunks of memory for each tree node
By copying the entire chunk past the address of a tree node and caching it, instead of first reading the header, then reading the `KeyPtr` or `Item` children, and reading records directly from the file/disk as needed, we waste huge amounts of RAM. Caching an entire chunk like that also makes exceedingly little sense if we ever want to support write access.
One possible solution might be a kind of GC-like structure that keeps track of views into chunks.
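A very rough sketch of that idea, using plain reference counting instead of a real garbage collector. None of this exists in the code; it is only meant to illustrate the direction:

```rust
use alloc::{collections::BTreeMap, rc::Rc, vec::Vec};
use core::ops::Range;

// Chunks are cached behind `Rc`; a `ChunkView` keeps its chunk alive and
// `evict_unused` throws away chunks that nobody is looking at anymore.
struct ChunkCache {
    // cached chunks keyed by their logical address
    chunks: BTreeMap<u64, Rc<Vec<u8>>>,
}

struct ChunkView {
    chunk: Rc<Vec<u8>>,
    range: Range<usize>,
}

impl ChunkView {
    fn bytes(&self) -> &[u8] {
        &self.chunk[self.range.clone()]
    }
}

impl ChunkCache {
    fn evict_unused(&mut self) {
        // strong_count > 1 means at least one `ChunkView` still points here
        self.chunks.retain(|_, chunk| Rc::strong_count(chunk) > 1);
    }
}
```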
### Looking up files by comparing file names instead of file name hashes
Btrfs stores `DirItemEntries` both one-to-one in `DirIndex` entries, as well as in `DirItem` records keyed by the file name hash. The hash-keyed records let us search an ordered range of name hashes and only compare the actual file names when there is a hash collision. Doing that instead of iterating through all `DirIndex` entries should be significantly faster.
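A sketch of what that lookup could look like. `name_hash` stands in for Btrfs's CRC32C-based name hash, and `try_into_dir_item`/`name()` mirror the accessors used elsewhere but are assumptions, not the crate's verified API:

```rust
// Hypothetical: resolve a child by name through the hash-keyed `DirItem`
// records instead of scanning every `DirIndex` entry of the directory.
pub fn get_child_by_name(&self, inode: &INode, name: &[u8]) -> Result<DirItem> {
    // `DirItem` keys use the name hash as their offset, so a partial key
    // narrows the range scan down to (usually) a single candidate.
    let hash = name_hash(name);
    let key = PartialKey::new(inode.id(), Some(ObjectType::DirItem), Some(hash));
    self.fs_root
        .find_range(&key)?
        .map(|(_, v)| v.try_into_dir_item().expect("dir item"))
        // hash collisions are possible, so the actual name still gets compared
        .find(|item| item.name() == name)
        .ok_or(Error::INodeNotFound)
}
```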