Add humanvbench operators #553

Open · wants to merge 1 commit into base: main
40 changes: 40 additions & 0 deletions configs/config_all.yaml
@@ -513,7 +513,47 @@ process:
mem_required: '9GB'
- whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace.


  - video_human_tracks_extraction_mapper:                   # Extract body and face trajectory bounding boxes for each person within one shot of the video. To ensure correctness, apply it after video_split_by_scene_mapper
face_track_bbox_path: your_path_to_save_bounding_box_data
YOLOv8_human_model_path: ./data_juicer/my_pretrained_method/YOLOv8_human/weights/best.pt
mem_required: '10GB'

- video_active_speaker_mapper: # Based on the results of video_human_tracks_extraction_mapper, determine whether each person is an active speaker
tempt_save_path: ./HumanVBenchRecipe/dj_ASD_tempt # Used to store temporary videos
      face_track_bbox_path: ./HumanVBenchRecipe/dj_human_track # Path where video_human_tracks_extraction_mapper stores the human track data
mem_required: '10GB'

  - video_audio_attribute_mapper:                           # If the audio is speech, classify the speaker's gender and age
hf_audio_mapper: 'pt_model/wav2vec2-large-robust-24-ft-age-gender' # Huggingface model name for speech age and gender classification
mem_required: '7GB'

- video_captioning_from_human_tracks_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on the single person in the video for captioning
video_describe_model_path: pt_model/sharegpt4video-8b # model path to sharegpt4video-8b
tempt_video_path: data_juicer/HumanVBenchRecipe/dj_tmpt # Used to store temporary videos
mem_required: '35GB'

- video_captioning_face_attribute_emotion_mapper: # Based on the results of video_human_tracks_extraction_mapper, focus on judging the gender, age, and race of a single person in the video
face_track_query: Please only describe the appearance and facial emotions of the person in the video in detail. Don't mention the background. Less than 80 words.
cropping_face_video_tempt_path: ./tempt_video/tmp_video_remove # Used to store temporary videos
video_describe_model_path: 'pt_model/VideoLLaMA2' # Huggingface model DAMO-NLP-SG/VideoLLaMA2-7B-16F
mem_required: '35GB'

- video_audio_speech_ASR_mapper: # Automatic speech recognition from video speech
model_dir_ASR: 'pt_model/SenseVoiceSmall' # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'

- video_audio_speech_emotion_mapper: # Speech emotion recognition from video speech
      model_dir_emo: 'pt_model/SenseVoiceSmall'             # Huggingface model FunAudioLLM/SenseVoiceSmall
mem_required: '20GB'


# Filter ops
- video_face_ratio_filter: # Filter to retain human-centric videos
threshold: 0.65 # The lower limit of the ratio of frames with faces to the total number of video frames
      detect_interval: 4                                    # run face detection once every 4 frames
      any_or_all: any                                       # keep the sample if any (rather than all) of its videos meet the threshold

- alphanumeric_filter: # filter text with alphabet/numeric ratio out of specific range.
tokenization: false # whether to count the ratio of alphanumeric to the total number of tokens.
min_ratio: 0.0 # the min ratio of filter range
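For orientation, here is a minimal recipe sketch showing the ordering implied by the comments above: scene splitting first, then human track extraction, then the track-dependent mappers, and finally the face-ratio filter. The dataset/output paths and the trimmed-down parameters are hypothetical placeholders, not part of this PR.

# Hypothetical minimal recipe; paths are placeholders, other parameters keep their defaults.
dataset_path: ./demo_videos.jsonl
export_path: ./output/humanvbench_demo.jsonl
process:
  - video_split_by_scene_mapper:                            # split videos into single-shot clips first
  - video_human_tracks_extraction_mapper:                   # then extract per-person body/face tracks
      face_track_bbox_path: ./dj_human_track
      YOLOv8_human_model_path: ./YOLOv8_human/weights/best.pt
  - video_active_speaker_mapper:                            # consumes the tracks extracted above
      tempt_save_path: ./dj_ASD_tempt
      face_track_bbox_path: ./dj_human_track
  - video_face_ratio_filter:                                # keep only clips where faces appear in most sampled frames
      threshold: 0.65
      detect_interval: 4
      any_or_all: any

Note that the mappers consuming human tracks read from the same face_track_bbox_path that video_human_tracks_extraction_mapper writes to.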
3 changes: 2 additions & 1 deletion data_juicer/ops/filter/__init__.py
@@ -43,6 +43,7 @@
from .video_watermark_filter import VideoWatermarkFilter
from .word_repetition_filter import WordRepetitionFilter
from .words_num_filter import WordsNumFilter
from .video_face_ratio_filter import VideoFaceRatioFilter

__all__ = [
'AlphanumericFilter', 'AudioDurationFilter', 'AudioNMFSNRFilter',
@@ -61,7 +62,7 @@
'VideoMotionScoreFilter', 'VideoMotionScoreRaftFilter', 'VideoNSFWFilter',
'VideoOcrAreaRatioFilter', 'VideoResolutionFilter',
'VideoTaggingFromFramesFilter', 'VideoWatermarkFilter',
'WordRepetitionFilter', 'WordsNumFilter'
'WordRepetitionFilter', 'WordsNumFilter', 'VideoFaceRatioFilter'
]

NON_STATS_FILTERS = [
142 changes: 142 additions & 0 deletions data_juicer/ops/filter/video_face_ratio_filter.py
@@ -0,0 +1,142 @@
import av
import numpy as np
from jsonargparse.typing import ClosedUnitInterval
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
                                         pil_to_opencv, process_each_frame)
from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_VIDEOS

import gc
import os

import psutil

OP_NAME = 'video_face_ratio_filter'

with AvailabilityChecking(['dlib', 'Pillow'], OP_NAME):
import cv2,dlib
from PIL import ImageFilter

@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoFaceRatioFilter(Filter):
"""Keep data samples whose videos' durations are within a specified range.
"""

def __init__(self,
threshold: ClosedUnitInterval = 0.8,
detect_interval: int = 1,
any_or_all: str = 'all',
*args,
**kwargs):
"""
Initialization method.

:param any_or_all: keep this sample with 'any' or 'all' strategy of
all videos. 'any': keep this sample if any videos meet the
condition. 'all': keep this sample only if all videos meet the
condition.
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.threshold = threshold

if any_or_all not in ['any', 'all']:
raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
f'Can only be one of ["any", "all"].')
self.any = (any_or_all == 'any')

        # Initialize the dlib frontal face detector
        self.detector = dlib.get_frontal_face_detector()
        # self.detector_key = prepare_model(model_type='face_detect_S3FD')

        self.detect_interval = detect_interval

def compute_stats(self, sample, rank=None, context=False):
# check if it's computed already
if StatsKeys.video_face_exist in sample[Fields.stats]:
return sample

# load videos
loaded_video_keys = sample[self.video_key]
video_faces_ratio = {}

# face_detect_S3FD = get_model(self.detector_key, rank=rank)

process = psutil.Process(os.getpid())
# memory_before = process.memory_info().rss / 1024 ** 2 # MB


for video_key in loaded_video_keys:
try:
with av.open(video_key) as container:
# getting video stream
video_stream = next(s for s in container.streams if s.type == 'video')
# iterate over the video frame and detect faces
frame_counter = 0
total_frames = 0
frames_with_face = 0
detect_num = 0
for packet in container.demux(video_stream):
try:
for frame in packet.decode():
total_frames += 1
frame_counter += 1

if frame_counter % self.detect_interval == 0:
detect_num = detect_num + 1
img = frame.to_image()
image = pil_to_opencv(img)
# imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# faces = face_detect_S3FD.detect_faces(imageNumpy, conf_th=0.9, scales=[0.25])
faces = self.detector(image)
if len(faces) > 0:
frames_with_face += 1
except Exception as e:
print(f"Frame decoding error in video {video_key}: {e}")
frames_with_face = 0
detect_num = 0

# calculate the proportion of the number of face frames
if detect_num > 0:
face_ratio = frames_with_face / detect_num
else:
face_ratio = 0.0
video_faces_ratio[video_key] = face_ratio
            except av.AVError as e:
                # the 'with' block already closes the container; a video that
                # cannot be opened simply gets a face ratio of 0.0
                print(f"Error opening video {video_key}: {e}")
                video_faces_ratio[video_key] = 0.0

# get video faces ratio
sample[Fields.stats][StatsKeys.video_face_exist] = [
video_faces_ratio[video_key] for video_key in sample[self.video_key]
]

memory_after = process.memory_info().rss / 1024 ** 2 # MB
print(f"Memory Usage: {memory_after:.2f} MB")

gc.collect()

return sample

def process(self, sample):
video_faces_ratio = sample[Fields.stats][StatsKeys.video_face_exist]
keep_bools = np.array([
            face_ratio >= self.threshold
            for face_ratio in video_faces_ratio
])
if len(keep_bools) <= 0:
return True

# different strategies
if self.any:
return keep_bools.any()
else:
return keep_bools.all()
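For context, a minimal standalone usage sketch of the new filter (the video path is a hypothetical placeholder; it assumes the default 'videos' sample key of the base Filter op and the StatsKeys.video_face_exist constant referenced above):

# Hypothetical usage sketch, not part of this PR: demo.mp4 is a placeholder
# and 'videos' is assumed to be the default video key of the base Filter op.
from data_juicer.ops.filter.video_face_ratio_filter import VideoFaceRatioFilter
from data_juicer.utils.constant import Fields, StatsKeys

op = VideoFaceRatioFilter(threshold=0.65, detect_interval=4, any_or_all='any')
sample = {'videos': ['demo.mp4'], Fields.stats: {}}

# compute_stats() stores one face ratio per video in the sample's stats field;
# process() then compares those ratios against the threshold.
sample = op.compute_stats(sample)
print(sample[Fields.stats][StatsKeys.video_face_exist])
print(op.process(sample))  # True -> keep the sample, False -> filter it out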
12 changes: 11 additions & 1 deletion data_juicer/ops/mapper/__init__.py
@@ -73,6 +73,13 @@
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper
from .video_active_speaker_mapper import VideoActiveSpeakerMapper
from .video_audio_attribute_mapper import VideoAudioAttributeMapper
from .video_audio_speech_ASR_mapper import VideoAudioSpeechASRMapper
from .video_captioning_face_attribute_emotion_mapper import VideoCaptioningFaceAttributeEmotionMapper
from .video_captioning_from_human_tracks_mapper import VideoCaptioningFromHumanTracksMapper
from .video_human_tracks_extraction_mapper import VideoHumanTracksExtractionMapper

__all__ = [
'AudioFFmpegWrappedMapper', 'CalibrateQAMapper', 'CalibrateQueryMapper',
@@ -105,5 +112,8 @@
'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
'WhitespaceNormalizationMapper'
    'WhitespaceNormalizationMapper', 'VideoActiveSpeakerMapper',
    'VideoAudioAttributeMapper', 'VideoAudioSpeechASRMapper',
    'VideoCaptioningFaceAttributeEmotionMapper',
    'VideoCaptioningFromHumanTracksMapper',
    'VideoHumanTracksExtractionMapper'
]