Skip to content

Commit

Permalink
Update with data from delta files
Browse files Browse the repository at this point in the history
  • Loading branch information
m417z committed Sep 11, 2022
1 parent dc053f6 commit bf850dc
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 127 deletions.
1 change: 0 additions & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ jobs:
choco install -y aria2
choco install -y tor
7z x data\tools.zip -odata\tools
curl -L -o data\tools\sxsexp64.exe https://github.com/hfiref0x/SXSEXP/raw/f67aa93a7d5acfdd4341d4b4822c351ce2206f62/Binary/sxsexp64.exe
- name: Install Linux dependencies
if: runner.os == 'Linux'
run: |
Expand Down
Binary file modified data/tools.zip
Binary file not shown.
184 changes: 138 additions & 46 deletions data/upd02_get_manifests_from_updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import platform
import requests
import hashlib
import shutil
import json
import re
Expand Down Expand Up @@ -57,7 +58,7 @@ def get_update_download_url(update_uid):
return matches[0]


def download_update(windows_version, update_kb):
def get_update(windows_version, update_kb):
# ARM only.
if update_kb in ['KB5016138', 'KB5016139']:
raise UpdateNotSupported
Expand All @@ -83,11 +84,40 @@ def download_update(windows_version, update_kb):

found_updates = [update for update in found_updates if not re.search(filter_regex, update[1], re.IGNORECASE)]

# Replace the pattern, and if after the replacement the item exists, filter it.
# For example, if there's both Cumulative and Delta, pick Cumulative.
filter_regex_pairs = [
[r'^(\d{4}-\d{2} )?Delta ', r'\1Cumulative '],
[r'\bWindows 10 Version 1909\b', r'Windows 10 Version 1903'],
]

found_update_titles = [update[1] for update in found_updates]
filtered_updates = []
for update in found_updates:
update_title = update[1]
matched = False
for search, replace in filter_regex_pairs:
update_title_sub, num_subs = re.subn(search, replace, update_title)
if num_subs > 0 and update_title_sub in found_update_titles:
matched = True
break

if not matched:
filtered_updates.append(update)

found_updates = filtered_updates

if len(found_updates) != 1:
raise Exception(f'Expected one update item, found {len(found_updates)}')

update_uid, update_title = found_updates[0]
assert re.fullmatch(rf'(\d{{4}}-\d{{2}} )?Cumulative Update (Preview )?for {package_windows_version} for x64-based Systems \({update_kb}\)', update_title), update_title
assert re.fullmatch(rf'(\d{{4}}-\d{{2}} )?(Cumulative|Delta) Update (Preview )?for {package_windows_version} for x64-based Systems \({update_kb}\)', update_title), update_title

return update_uid, update_title


def download_update(windows_version, update_kb):
update_uid, update_title = get_update(windows_version, update_kb)

download_url = get_update_download_url(update_uid)
if not download_url:
Expand All @@ -104,78 +134,140 @@ def download_update(windows_version, update_kb):
# shutil.copyfileobj(r.raw, f)

args = ['aria2c', '-x4', '-o', local_path, '--allow-overwrite=true', download_url]
subprocess.run(args, check=True, stdout=None if config.verbose_run else subprocess.DEVNULL)
subprocess.check_call(args, stdout=None if config.verbose_run else subprocess.DEVNULL)

return download_url, local_dir, local_path


def extract_manifest_files(local_dir, local_path):
def cab_extract(pattern, from_file, to_dir):
def sha256sum(filename):
    """Return the hex-encoded SHA-256 digest of the file at *filename*.

    Hashes in fixed-size 128 KiB chunks so arbitrarily large files can be
    digested without loading them into memory.
    Based on https://stackoverflow.com/a/44873382.
    """
    digest = hashlib.sha256()
    chunk = bytearray(128 * 1024)
    view = memoryview(chunk)
    # buffering=0 avoids a redundant copy through Python's buffered layer;
    # readinto() fills our reusable buffer directly.
    with open(filename, 'rb', buffering=0) as handle:
        read_count = handle.readinto(view)
        while read_count:
            digest.update(view[:read_count])
            read_count = handle.readinto(view)
    return digest.hexdigest()


def extract_update_files(local_dir: Path, local_path: Path):
def cab_extract(pattern: str, from_file: Path, to_dir: Path):
to_dir.mkdir()
if platform.system() == 'Windows':
args = ['expand', '-r', f'-f:{pattern}', from_file, to_dir]
else:
args = ['cabextract', '-F', pattern, '-d', to_dir, from_file]
subprocess.run(args, check=True, stdout=None if config.verbose_run else subprocess.DEVNULL)

extract_dirs = []
for i in range(4):
extract_dir = local_dir.joinpath(f'extract{i + 1}')
extract_dir.mkdir(parents=True, exist_ok=True)
extract_dirs.append(extract_dir)

cab_extract('*.cab', local_path, extract_dirs[0])

for i in range(4):
for cab in extract_dirs[i].glob('*.cab'):
if cab.name.lower() in (x.lower() for x in [
'DesktopDeployment.cab',
'DesktopDeployment_x86.cab',
'onepackage.AggregatedMetadata.cab',
'WSUSSCAN.cab'
]):
continue

cab_extract('*.manifest', cab, local_dir)
cab_extract('*.cab', cab, extract_dirs[i + 1])

# Assert that we're done.
assert not any(extract_dirs[3].glob('*.cab'))

for extract_dir in extract_dirs:
shutil.rmtree(extract_dir)

local_path.unlink()
subprocess.check_call(args, stdout=None if config.verbose_run else subprocess.DEVNULL)

# Extract all files from all cab files until no more cab files can be found.
first_unhandled_extract_dir_num = 1
next_extract_dir_num = 1

extract_dir = local_dir.joinpath(f'_extract_{next_extract_dir_num}')
next_extract_dir_num += 1
cab_extract('*', local_path, extract_dir)

while first_unhandled_extract_dir_num < next_extract_dir_num:
next_unhandled_extract_dir_num = next_extract_dir_num

for src_extract_dir_num in range(first_unhandled_extract_dir_num, next_extract_dir_num):
src_extract_dir = local_dir.joinpath(f'_extract_{src_extract_dir_num}')
for cab in src_extract_dir.glob('*.cab'):
extract_dir = local_dir.joinpath(f'_extract_{next_extract_dir_num}')
next_extract_dir_num += 1
cab_extract('*', cab, extract_dir)

first_unhandled_extract_dir_num = next_unhandled_extract_dir_num

# Move all extracted files from all folders to the target folder.
for extract_dir in local_dir.glob('_extract_*'):
def ignore_files(path, names):
source_dir = Path(path)
destination_dir = local_dir.joinpath(Path(path).relative_to(extract_dir))

ignore = []
for name in names:
source_file = source_dir.joinpath(name)
if source_file.is_file():
# Ignore files in root folder which have different non-identical copies with the same name.
# Also ignore cab archives in the root folder.
if source_dir == extract_dir:
if (name in ['update.cat', 'update.mum'] or
name.endswith('.cab') or
name.endswith('.dll')):
ignore.append(name)
continue

# Ignore files which already exist as long as they're identical.
destination_file = destination_dir.joinpath(name)
if destination_file.exists():
if not destination_file.is_file():
raise Exception(f'A destination item already exists and is not a file: {destination_file}')

if sha256sum(source_file) != sha256sum(destination_file):
raise Exception(f'A different file copy already exists: {destination_file}')

ignore.append(name)

return ignore

shutil.copytree(extract_dir, local_dir, copy_function=shutil.move, dirs_exist_ok=True, ignore=ignore_files)

# Extract delta files from the PSF file which can be found in Windows 11 updates.
# References:
# https://www.betaarchive.com/forum/viewtopic.php?t=43163
# https://github.com/Secant1006/PSFExtractor
if platform.system() == 'Windows':
psf_files = list(local_dir.glob('*.psf'))
assert len(psf_files) <= 1
if len(psf_files) == 1:
psf_file = psf_files[0]
args = ['tools/PSFExtractor.exe', '-v2', psf_file, local_dir.joinpath('express.psf.cix.xml'), local_dir]
subprocess.check_call(args, stdout=None if config.verbose_run else subprocess.DEVNULL)

# Use DeltaDownloader to extract meaningful data from delta files:
# https://github.com/m417z/DeltaDownloader
if platform.system() == 'Windows':
# Avoid path limitations by using a UNC path.
local_dir_unc = Rf'\\?\{local_dir.absolute()}'
args = ['tools/DeltaDownloader/DeltaDownloader.exe', "/g", local_dir_unc]
subprocess.check_call(args, stdout=None if config.verbose_run else subprocess.DEVNULL)

# Starting with Windows 11, manifest files are compressed with the DCM v1 format.
# Use SYSEXP to de-compress them: https://github.com/hfiref0x/SXSEXP
if platform.system() == 'Windows':
args = ['tools/sxsexp64.exe', local_dir, local_dir]
subprocess.run(args, stdout=None if config.verbose_run else subprocess.DEVNULL)
subprocess.check_call(args, stdout=None if config.verbose_run else subprocess.DEVNULL)

for extract_dir in local_dir.glob('_extract_*'):
shutil.rmtree(extract_dir)

local_path.unlink()


def get_manifests_from_update(windows_version, update_kb):
def get_files_from_update(windows_version: str, update_kb: str):
print(f'[{update_kb}] Downloading update')

download_url, local_dir, local_path = download_update(windows_version, update_kb)
print(f'[{update_kb}] Downloaded {local_path.stat().st_size} bytes from {download_url}')

def extract_manifest_files_start():
print(f'[{update_kb}] Extracting manifest files')
def extract_update_files_start():
print(f'[{update_kb}] Extracting update files')
try:
extract_manifest_files(local_dir, local_path)
extract_update_files(local_dir, local_path)
except Exception as e:
print(f'[{update_kb}] ERROR: Failed to process update')
print(f'[{update_kb}] ' + str(e))
print(f'[{update_kb}] {e}')
if config.exit_on_first_error:
raise
return
print(f'[{update_kb}] Extracted manifest files')
print(f'[{update_kb}] Extracted update files')

if config.extract_in_a_new_thread:
thread = Thread(target=extract_manifest_files_start)
thread = Thread(target=extract_update_files_start)
thread.start()
else:
extract_manifest_files_start()
extract_update_files_start()


def main():
Expand All @@ -187,7 +279,7 @@ def main():

for update_kb in updates[windows_version]:
try:
get_manifests_from_update(windows_version, update_kb)
get_files_from_update(windows_version, update_kb)
except UpdateNotSupported:
print(f'[{update_kb}] WARNING: Skipping unsupported update')
except UpdateNotFound:
Expand All @@ -203,7 +295,7 @@ def main():
print(f'[{update_kb}] WARNING: Update wasn\'t found, it was probably removed from the update catalog')
except Exception as e:
print(f'[{update_kb}] ERROR: Failed to process update')
print(f'[{update_kb}] ' + str(e))
print(f'[{update_kb}] {e}')
if config.exit_on_first_error:
raise

Expand Down
60 changes: 51 additions & 9 deletions data/upd03_parse_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,45 @@ def update_file_hashes():
file_hashes.clear()


def parse_manifest_file(file_el):
def get_delta_data_for_manifest_file(manifest_path: Path, filename: str):
    """Read the extracted delta metadata (.dd.txt) for *filename*.

    The metadata file is expected next to the manifest, at
    ``<manifest stem>/f/<filename>.dd.txt``, and consists of ``Key: value``
    lines. Returns a dict with size/hash info (plus PE header details for
    non-Raw deltas), or None when no metadata file exists.
    """
    delta_path = manifest_path.parent.joinpath(manifest_path.stem, 'f', filename + '.dd.txt')
    if not delta_path.exists():
        return None

    raw_text = delta_path.read_text()

    # Parse the "Key: value" lines; later duplicates overwrite earlier ones.
    fields = {key: value.strip()
              for key, value in re.findall(r'^(\w+):(.*)$', raw_text, re.MULTILINE)}

    result = {'size': int(fields['TargetSize'])}

    assert fields['HashAlgorithm'] == 'CALG_MD5'
    result['md5'] = fields['Hash'].lower()

    if fields['Code'] != 'Raw':
        # IMAGE_FILE_MACHINE_* values keyed by the delta code name.
        code_to_machine_type = {
            'CLI4_I386': 332,     # IMAGE_FILE_MACHINE_I386
            'CLI4_AMD64': 34404,  # IMAGE_FILE_MACHINE_AMD64
            'CLI4_ARM64': 43620,  # IMAGE_FILE_MACHINE_ARM64
        }
        result['machineType'] = code_to_machine_type[fields['Code']]

        result['timestamp'] = int(fields['TimeStamp'])

        # The rift table is "va,raw;va,raw;..."; the last entry describes
        # the final section of the target PE file.
        last_rift_entry = fields['RiftTable'].split(';')[-1].split(',')
        result['lastSectionVirtualAddress'] = int(last_rift_entry[0])
        result['lastSectionPointerToRawData'] = int(last_rift_entry[1])

    return result


def parse_manifest_file(manifest_path, file_el):
hashes = list(file_el.findall('hash'))
if len(hashes) != 1:
raise Exception('Expected to have a single hash tag')
Expand Down Expand Up @@ -55,8 +93,8 @@ def parse_manifest_file(file_el):
digest_value_el = digest_values[0]
hash = base64.b64decode(digest_value_el.text).hex()

filename = file_el.attrib['name'].split('\\')[-1].lower()
if algorithm == 'sha256':
filename = file_el.attrib['name'].split('\\')[-1].lower()
if (re.search(r'\.(exe|dll|sys|winmd|cpl|ax|node|ocx|efi|acm|scr|tsp|drv)$', filename)):
file_hashes.setdefault(filename, set()).add(hash)

Expand All @@ -65,14 +103,18 @@ def parse_manifest_file(file_el):
'attributes': dict(file_el.attrib.items()),
}

delta_data = get_delta_data_for_manifest_file(manifest_path, filename)
if delta_data:
result['delta'] = delta_data

return result


def parse_manifest(filename):
#root = ET.parse(filename).getroot()
def parse_manifest(manifest_path: Path):
#root = ET.parse(str(manifest_path)).getroot()
# Strip namespaces.
# https://stackoverflow.com/a/33997423
it = ET.iterparse(filename)
it = ET.iterparse(str(manifest_path))
for _, el in it:
if '}' in el.tag:
el.tag = el.tag.split('}', 1)[1] # strip all namespaces
Expand All @@ -94,7 +136,7 @@ def parse_manifest(filename):

files = []
for file_el in root.findall('file'):
parsed = parse_manifest_file(file_el)
parsed = parse_manifest_file(manifest_path, file_el)
files.append(parsed)

result = {
Expand All @@ -105,18 +147,18 @@ def parse_manifest(filename):
return result


def parse_manifests(manifests_dir, output_dir):
def parse_manifests(manifests_dir: Path, output_dir: Path):
output_dir.mkdir(parents=True, exist_ok=True)

for path in manifests_dir.glob('*.manifest'):
if not path.is_file():
continue

try:
parsed = parse_manifest(str(path))
parsed = parse_manifest(path)
except Exception as e:
print(f'ERROR: failed to process {path}')
print(' ' + str(e))
print(f' {e}')
if config.exit_on_first_error:
raise
continue
Expand Down
Loading

0 comments on commit bf850dc

Please sign in to comment.